diff --git a/src/applications/diffusion/management/DiffusionRepositoryClusterManagementPanel.php b/src/applications/diffusion/management/DiffusionRepositoryClusterManagementPanel.php index 2017871453..22ecb3db3f 100644 --- a/src/applications/diffusion/management/DiffusionRepositoryClusterManagementPanel.php +++ b/src/applications/diffusion/management/DiffusionRepositoryClusterManagementPanel.php @@ -1,167 +1,196 @@ getRepository(); $viewer = $this->getViewer(); $service_phid = $repository->getAlmanacServicePHID(); if ($service_phid) { $service = id(new AlmanacServiceQuery()) ->setViewer($viewer) ->withServiceTypes( array( AlmanacClusterRepositoryServiceType::SERVICETYPE, )) ->withPHIDs(array($service_phid)) ->needBindings(true) ->executeOne(); if (!$service) { // TODO: Viewer may not have permission to see the service, or it may // be invalid? Raise some more useful error here? throw new Exception(pht('Unable to load cluster service.')); } } else { $service = null; } Javelin::initBehavior('phabricator-tooltips'); $rows = array(); if ($service) { $bindings = $service->getBindings(); $bindings = mgroup($bindings, 'getDevicePHID'); // This is an unusual read which always comes from the master. 
if (PhabricatorEnv::isReadOnly()) { $versions = array(); } else { $versions = PhabricatorRepositoryWorkingCopyVersion::loadVersions( $repository->getPHID()); } $versions = mpull($versions, null, 'getDevicePHID'); foreach ($bindings as $binding_group) { $all_disabled = true; foreach ($binding_group as $binding) { if (!$binding->getIsDisabled()) { $all_disabled = false; break; } } $any_binding = head($binding_group); if ($all_disabled) { $binding_icon = 'fa-times grey'; $binding_tip = pht('Disabled'); } else { $binding_icon = 'fa-folder-open green'; $binding_tip = pht('Active'); } $binding_icon = id(new PHUIIconView()) ->setIcon($binding_icon) ->addSigil('has-tooltip') ->setMetadata( array( 'tip' => $binding_tip, )); $device = $any_binding->getDevice(); $version = idx($versions, $device->getPHID()); if ($version) { $version_number = $version->getRepositoryVersion(); $version_number = phutil_tag( 'a', array( 'href' => "/diffusion/pushlog/view/{$version_number}/", ), $version_number); } else { $version_number = '-'; } if ($version && $version->getIsWriting()) { $is_writing = id(new PHUIIconView()) ->setIcon('fa-pencil green'); } else { $is_writing = id(new PHUIIconView()) ->setIcon('fa-pencil grey'); } + $write_properties = null; + if ($version) { + $write_properties = $version->getWriteProperties(); + if ($write_properties) { + try { + $write_properties = phutil_json_decode($write_properties); + } catch (Exception $ex) { + $write_properties = null; + } + } + } + + if ($write_properties) { + $writer_phid = idx($write_properties, 'userPHID'); + $last_writer = $viewer->renderHandle($writer_phid); + + $writer_epoch = idx($write_properties, 'epoch'); + $writer_epoch = phabricator_datetime($writer_epoch, $viewer); + } else { + $last_writer = null; + $writer_epoch = null; + } + $rows[] = array( $binding_icon, phutil_tag( 'a', array( 'href' => $device->getURI(), ), $device->getName()), $version_number, $is_writing, + $last_writer, + $writer_epoch, ); } } $table = id(new 
AphrontTableView($rows)) ->setNoDataString(pht('This is not a cluster repository.')) ->setHeaders( array( null, pht('Device'), pht('Version'), pht('Writing'), + pht('Last Writer'), + pht('Last Write At'), )) ->setColumnClasses( array( null, null, null, 'right wide', + null, + 'date', )); $doc_href = PhabricatorEnv::getDoclink('Cluster: Repositories'); $header = id(new PHUIHeaderView()) ->setHeader(pht('Cluster Status')) ->addActionLink( id(new PHUIButtonView()) ->setIcon('fa-book') ->setHref($doc_href) ->setTag('a') ->setText(pht('Documentation'))); if ($service) { $header->setSubheader( pht( 'This repository is hosted on %s.', phutil_tag( 'a', array( 'href' => $service->getURI(), ), $service->getName()))); } return id(new PHUIObjectBoxView()) ->setHeader($header) ->setBackground(PHUIObjectBoxView::BLUE_PROPERTY) ->setTable($table); } } diff --git a/src/applications/repository/storage/PhabricatorRepositoryWorkingCopyVersion.php b/src/applications/repository/storage/PhabricatorRepositoryWorkingCopyVersion.php index 888301e807..c8ad477a3d 100644 --- a/src/applications/repository/storage/PhabricatorRepositoryWorkingCopyVersion.php +++ b/src/applications/repository/storage/PhabricatorRepositoryWorkingCopyVersion.php @@ -1,156 +1,155 @@ false, self::CONFIG_COLUMN_SCHEMA => array( 'repositoryVersion' => 'uint32', 'isWriting' => 'bool', 'writeProperties' => 'text?', ), self::CONFIG_KEY_SCHEMA => array( 'key_workingcopy' => array( 'columns' => array('repositoryPHID', 'devicePHID'), 'unique' => true, ), ), ) + parent::getConfiguration(); } public static function loadVersions($repository_phid) { $version = new self(); $conn_w = $version->establishConnection('w'); $table = $version->getTableName(); // This is a normal read, but force it to come from the master. 
$rows = queryfx_all( $conn_w, 'SELECT * FROM %T WHERE repositoryPHID = %s', $table, $repository_phid); return $version->loadAllFromArray($rows); } public static function getReadLock($repository_phid, $device_phid) { $repository_hash = PhabricatorHash::digestForIndex($repository_phid); $device_hash = PhabricatorHash::digestForIndex($device_phid); $lock_key = "repo.read({$repository_hash}, {$device_hash})"; return PhabricatorGlobalLock::newLock($lock_key); } public static function getWriteLock($repository_phid) { $repository_hash = PhabricatorHash::digestForIndex($repository_phid); $lock_key = "repo.write({$repository_hash})"; return PhabricatorGlobalLock::newLock($lock_key); } /** * Before a write, set the "isWriting" flag. * * This allows us to detect when we lose a node partway through a write and * may have committed and acknowledged a write on a node that lost the lock * partway through the write and is no longer reachable. * * In particular, if a node loses its connection to the database the global * lock is released by default. This is a durable lock which stays locked * by default. */ public static function willWrite( $repository_phid, $device_phid, array $write_properties) { $version = new self(); $conn_w = $version->establishConnection('w'); $table = $version->getTableName(); queryfx( $conn_w, 'INSERT INTO %T (repositoryPHID, devicePHID, repositoryVersion, isWriting, writeProperties) VALUES (%s, %s, %d, %d, %s) ON DUPLICATE KEY UPDATE isWriting = VALUES(isWriting), writeProperties = VALUES(writeProperties)', $table, $repository_phid, $device_phid, 0, 1, phutil_json_encode($write_properties)); } /** * After a write, update the version and release the "isWriting" lock.
*/ public static function didWrite( $repository_phid, $device_phid, $old_version, $new_version) { $version = new self(); $conn_w = $version->establishConnection('w'); $table = $version->getTableName(); queryfx( $conn_w, 'UPDATE %T SET repositoryVersion = %d, - isWriting = 0, - writeProperties = null + isWriting = 0 WHERE repositoryPHID = %s AND devicePHID = %s AND repositoryVersion = %d AND isWriting = 1', $table, $new_version, $repository_phid, $device_phid, $old_version); } /** * After a fetch, set the local version to the fetched version. */ public static function updateVersion( $repository_phid, $device_phid, $new_version) { $version = new self(); $conn_w = $version->establishConnection('w'); $table = $version->getTableName(); queryfx( $conn_w, 'INSERT INTO %T (repositoryPHID, devicePHID, repositoryVersion, isWriting) VALUES (%s, %s, %d, %d) ON DUPLICATE KEY UPDATE repositoryVersion = VALUES(repositoryVersion)', $table, $repository_phid, $device_phid, $new_version, 0); } } diff --git a/src/docs/user/cluster/cluster_repositories.diviner b/src/docs/user/cluster/cluster_repositories.diviner index eb9a4f4ede..fc35b3b619 100644 --- a/src/docs/user/cluster/cluster_repositories.diviner +++ b/src/docs/user/cluster/cluster_repositories.diviner @@ -1,198 +1,275 @@ @title Cluster: Repositories @group intro Configuring Phabricator to use multiple repository hosts. Overview ======== WARNING: This feature is a very early prototype; the features this document describes are mostly speculative fantasy. If you use Git or Mercurial, you can deploy Phabricator with multiple repository hosts, configured so that each host is readable and writable. The advantages of doing this are: - you can completely survive the loss of repository hosts; - reads and writes can scale across multiple machines; and - read and write performance across multiple geographic regions may improve. This configuration is complex, and many installs do not need to pursue it. 
This configuration is not currently supported with Subversion or Mercurial. Repository Hosts ================ Repository hosts must run a complete, fully configured copy of Phabricator, including a webserver. They must also run a properly configured `sshd`. Generally, these hosts will run the same set of services and configuration that web hosts run. If you prefer, you can overlay these services and put web and repository services on the same hosts. See @{article:Clustering Introduction} for some guidance on overlaying services. When a user requests information about a repository that can only be satisfied by examining a repository working copy, the webserver receiving the request will make an HTTP service call to a repository server which hosts the repository to retrieve the data it needs. It will use the result of this query to respond to the user. How Reads and Writes Work ========================= Phabricator repository replicas are multi-master: every node is readable and writable, and a cluster of nodes can (almost always) survive the loss of any arbitrary subset of nodes so long as at least one node is still alive. Phabricator maintains an internal version for each repository, and increments it when the repository is mutated. Before responding to a read, replicas make sure their version of the repository is up to date (no node in the cluster has a newer version of the repository). If it isn't, they block the read until they can complete a fetch. Before responding to a write, replicas obtain a global lock, perform the same version check and fetch if necessary, then allow the write to continue. Additionally, repositories passively check other nodes for updates and replicate changes in the background. After you push a change to a repository, it will usually spread passively to all other repository nodes within a few minutes.
Even if passive replication is slow, the active replication makes acknowledged changes sequential to all observers: after a write is acknowledged, all subsequent reads are guaranteed to see it. The system does not permit stale reads, and you do not need to wait for a replication delay to see a consistent view of the repository no matter which node you ask. HTTP vs HTTPS ============= Intracluster requests (from the daemons to repository servers, or from webservers to repository servers) are permitted to use HTTP, even if you have set `security.require-https` in your configuration. It is common to terminate SSL at a load balancer and use plain HTTP beyond that, and the `security.require-https` feature is primarily focused on making client browser behavior more convenient for users, so it does not apply to intracluster traffic. Using HTTP within the cluster leaves you vulnerable to attackers who can observe traffic within a datacenter, or observe traffic between datacenters. This is normally very difficult, but within reach for state-level adversaries like the NSA. If you are concerned about these attackers, you can terminate HTTPS on repository hosts and bind to them with the "https" protocol. Just be aware that the `security.require-https` setting won't prevent you from making configuration mistakes, as it doesn't cover intracluster traffic. Other mitigations are possible, but securing a network against the NSA and similar agents of other rogue nations is beyond the scope of this document. Monitoring Replication ====================== You can review the current status of a repository on cluster nodes in {nav Diffusion > (Repository) > Manage Repository > Cluster Configuration}. This screen shows all the configured devices which are hosting the repository and the available version. **Version**: When a repository is mutated by a push, Phabricator increases an internal version number for the repository. 
This column shows which version is on disk on the corresponding node. After a change is pushed, the node which received the change will have a larger version number than the other nodes. The change should be passively replicated to the remaining nodes after a brief period of time, although this can take a while if the change was large or the network connection between nodes is slow or unreliable. You can click the version number to see the corresponding push logs for that change. The logs contain details about what was changed, and can help you identify if replication is slow because a change is large or for some other reason. **Writing**: This shows that the node is currently holding a write lock. This normally means that it is actively receiving a push, but can also mean that there was a write interruption. See "Write Interruptions" below for details. +**Last Writer**: This column identifies the user who most recently pushed a +change to this device. If the write lock is currently held, this user is +the user whose change is holding the lock. + +**Last Write At**: When the most recent write started. If the write lock is +currently held, this shows when the lock was acquired. + Write Interruptions =================== A repository cluster can be put into an inconsistent state by an interruption -in a brief window immediately after a write. +in a brief window during and immediately after a write. Phabricator can not commit changes to a working copy (stored on disk) and to the global state (stored in a database) atomically, so there is a narrow window between committing these two different states when some tragedy (like a lightning strike) can befall a server, leaving the global and local views of -the repository state divergent. +the repository state possibly divergent. 
-In these cases, Phabricator fails into a "frozen" state where further writes +In these cases, Phabricator fails into a frozen state where further writes are not permitted until the failure is investigated and resolved. -TODO: Complete the support tooling and provide recovery instructions. +You can use the monitoring console to review the state of a frozen repository +with a held write lock. The **Writing** column will show which node is holding +the lock, and whoever is named in the **Last Writer** column may be able to +help you figure out what happened by providing more information about what they +were doing and what they observed. + +Because the push was not acknowledged, it is normally safe to demote the node: +the user should have received an error anyway, and should not expect their push +to have worked. However, data is technically at risk and you may want to +investigate further and try to understand the issue in more detail before +continuing. + +There is no way to explicitly keep the write, but if it was committed to disk +you can recover it manually from the working copy on the device and then push +it again. + +If you demote the node, the in-process write will be thrown away, even if it +was complete on disk. To demote the node and release the write lock, run this +command: + +``` +phabricator/ $ ./bin/repository thaw rXYZ --demote repo002.corp.net +``` + +{icon exclamation-triangle, color="yellow"} Any committed but unacknowledged +data on the device will be lost. Loss of Leaders =============== A more straightforward failure condition is the loss of all servers in a cluster which have the most up-to-date copy of a repository. This looks like this: - There is a cluster setup with two nodes, X and Y. - A new change is pushed to server X. - Before the change can propagate to server Y, lightning strikes server X and destroys it. Here, all of the "leader" nodes with the most up-to-date copy of the repository have been lost. 
Phabricator will refuse to serve this repository because it can not serve it consistently, and can not accept writes without data loss. The most straightforward way to resolve this issue is to restore any leader to service. The change will be able to replicate to other nodes once a leader comes back online. If you are unable to restore a leader or unsure that you can restore one quickly, you can use the monitoring console to review which changes are present on the leaders but not present on the followers by examining the push logs. -TODO: Complete the support tooling and provide recovery instructions. +If you are comfortable discarding these changes, you can instruct Phabricator +that it can forget about the leaders in two ways: disable the service bindings +to all of the leader nodes so they are no longer part of the cluster, or +use `bin/repository thaw` to `--demote` the leaders explicitly. + +If you do this, **you will lose data**. Either action will discard any changes +on the affected leaders which have not replicated to other nodes in the cluster. + +To demote a device, run this command: + +``` +phabricator/ $ ./bin/repository thaw rXYZ --demote repo002.corp.net +``` + +{icon exclamation-triangle, color="red"} Any data which is only present on +**this** device will be lost. + + +Ambiguous Leaders +================= + +Repository clusters can also freeze if the leader nodes are ambiguous. This +can happen if you replace an entire cluster with new devices suddenly, or +make a mistake with the `--demote` flag. + +When Phabricator can not tell which node in a cluster is a leader, it freezes +the cluster because it is possible that some nodes have less data and others +have more, and if it chooses a leader arbitrarily it may destroy some data +which you would prefer to retain. + +To resolve this, you need to tell Phabricator which node has the most +up-to-date data and promote that node to become a leader.
If you do this, +**you may lose data** if you promote the wrong node, and some other node +really had more up-to-date data. If you want to double check, you can examine +the working copies on disk before promoting, by connecting to the machines and +using commands like `git log` to inspect state. + +Once you have identified a node which has data you're happy with, use +`bin/repository thaw` to `--promote` the device: + +``` +phabricator/ $ ./bin/repository thaw rXYZ --promote repo002.corp.net +``` + +{icon exclamation-triangle, color="red"} Any data which is only present on +**other** devices will be lost. Backups ====== Even if you configure clustering, you should still consider retaining separate backup snapshots. Replicas protect you from data loss if you lose a host, but they do not let you rewind time to recover from data mutation mistakes. If something issues a `--force` push that destroys branch heads, the mutation will propagate to the replicas. You may be able to manually restore the branches by using tools like the Phabricator push log or the Git reflog so it is less important to retain repository snapshots than database snapshots, but it is still possible for data to be lost permanently, especially if you don't notice the problem for some time. Retaining separate backup snapshots will improve your ability to recover more data more easily in a wider range of disaster situations. Next Steps ========== Continue by: - returning to @{article:Clustering Introduction}.