diff --git a/src/applications/config/controller/PhabricatorConfigClusterDatabasesController.php b/src/applications/config/controller/PhabricatorConfigClusterDatabasesController.php --- a/src/applications/config/controller/PhabricatorConfigClusterDatabasesController.php +++ b/src/applications/config/controller/PhabricatorConfigClusterDatabasesController.php @@ -35,6 +35,8 @@ $rows = array(); foreach ($databases as $database) { + $messages = array(); + if ($database->getIsMaster()) { $role_icon = id(new PHUIIconView()) ->setIcon('fa-database sky') @@ -125,6 +127,9 @@ } else { $health_icon = id(new PHUIIconView()) ->setIcon('fa-times red'); + $messages[] = pht( + 'UNHEALTHY: This database has failed recent health checks. Traffic '. + 'will not be sent to it until it recovers.'); } $health_count = pht( @@ -138,8 +143,6 @@ $health_count, ); - $messages = array(); - $conn_message = $database->getConnectionMessage(); if ($conn_message) { $messages[] = $conn_message; diff --git a/src/docs/user/cluster/cluster_databases.diviner b/src/docs/user/cluster/cluster_databases.diviner --- a/src/docs/user/cluster/cluster_databases.diviner +++ b/src/docs/user/cluster/cluster_databases.diviner @@ -22,6 +22,10 @@ Phabricator can not currently be configured into a multi-master mode, nor can it be configured to automatically promote a replica to become the new master. +If you lose the master, Phabricator can degrade automatically into read-only +mode and remain available, but can not fully recover without operational +intervention unless the master recovers on its own. + Setting up MySQL Replication ============================ @@ -59,17 +63,109 @@ `mysql.pass`) are used only to provide defaults. Once you've configured this option, restart Phabricator for the changes to take -effect, then continue to "Monitoring and Testing" to verify the configuration. +effect, then continue to "Monitoring Replicas" to verify the configuration. 
-Monitoring and Testing -====================== +Monitoring Replicas +=================== You can monitor replicas in {nav Config > Cluster Databases}. This interface shows you a quick overview of replicas and their health, and can detect some common issues with replication. -TODO: Write more stuff here. +The table on this page shows each database and its current status. + +NOTE: This page runs its diagnostics //from the web server that is serving the +request//. If you are recovering from a disaster, the view this page shows +may be partial or misleading, and two requests served by different servers may +see different views of the cluster. + +**Connection**: Phabricator tries to connect to each configured database, then +shows the result in this column. If it fails, a brief diagnostic message with +details about the error is shown. If it succeeds, the column shows a rough +measurement of latency from the current webserver to the database. + +**Replication**: This is a summary of replication status on the database. If +things are properly configured and stable, the replicas should be actively +replicating and no more than a few seconds behind master, and the master +should //not// be replicating from another database. + +To report this status, the user Phabricator is connecting as must have the +`REPLICATION CLIENT` privilege (or the `SUPER` privilege) so it can run the +`SHOW SLAVE STATUS` command. The `REPLICATION CLIENT` privilege only enables +the user to run diagnostic commands so it should be reasonable to grant it in +most cases, but it is not required. If you choose not to grant it, this page +can not show any useful diagnostic information about replication status but +everything else will still work. + +If a replica is more than a second behind master, this page will show the +current replication delay. If the replication delay is more than 30 seconds, +it will report "Slow Replication" with a warning icon. 
+ +If replication is delayed, data is at risk: if you lose the master and can not +later recover it (for example, because a meteor has obliterated the datacenter +housing the physical host), data which did not make it to the replica will be +lost forever. + +Beyond the risk of data loss, any read-only traffic sent to the replica will +see an older view of the world which could be confusing for users: it may +appear that their data has been lost, even if it is safe and just hasn't +replicated yet. + +Phabricator will attempt to prevent clients from seeing out-of-date views, but +sometimes sending traffic to a delayed replica is the best available option +(for example, if the master can not be reached). + +**Health**: This column shows the result of recent health checks against the +server. After several checks in a row fail, Phabricator will mark the server +as unhealthy and stop sending traffic to it until several checks in a row +later succeed. + +Note that each web server tracks database health independently, so if you have +several servers they may have different views of database health. This is +normal and not problematic. + +For more information on health checks, see "Unreachable Masters" below. + +**Messages**: This column has additional details about any errors shown in the +other columns. These messages can help you understand or resolve problems. + + +Testing Replicas +================ + +To test that your configuration can survive a disaster, turn off the master +database. Do this with great ceremony, making a cool explosion sound as you +run the `mysqld stop` command. + +If things have been set up properly, Phabricator should degrade to a temporary +read-only mode immediately. After a brief period of unresponsiveness, it will +degrade further into a longer-term read-only mode. For details on how this +works internally, see "Unreachable Masters" below. + +Once satisfied, turn the master back on. 
After a brief delay, Phabricator +should recognize that the master is healthy again and recover fully. + +Throughout this process, the {nav Cluster Databases} console will show a +current view of the world from the perspective of the web server handling the +request. You can use it to monitor state. + +You can perform a more narrow test by enabling `cluster.read-only` in +configuration. This will put Phabricator into read-only mode immediately +without turning off any databases. + +You can use this mode to understand which capabilities will and will not be +available in read-only mode, and make sure any information you want to remain +accessible in a disaster (like wiki pages or contact information) is really +accessible. + +See the next section, "Degradation to Read-Only Mode", for more details about +when, why, and how Phabricator degrades. + +If you run custom code or extensions, they may not accommodate read-only mode +properly. You should specifically test that they function correctly in +read-only mode and do not prevent you from accessing important information. + Degradation to Read-Only Mode ============================= @@ -78,8 +174,8 @@ - you turn it on explicitly; - you configure cluster mode, but don't set up any masters; - - the master is misconfigured and unsafe to write to; or - - the master is unreachable. + - the master can not be reached while handling a request; or + - recent attempts to connect to the master have consistently failed. When Phabricator is running in read-only mode, users can still read data and browse and clone repositories, but they can not edit, update, or push new @@ -99,20 +195,9 @@ be more convenient than turning it on explicitly during the course of operations work. -Before writing to a master, Phabricator will verify that the host is not -configured as a replica. This is a safety feature to prevent data loss if your -MySQL and Phabricator configurations disagree about replica configuration. 
If -your `master` is currently replicating from another host, Phabricator will -treat it as a `replica` instead and implicitly degrade into read-only mode. - -Finally, if Phabricator is unable to reach the master, it will degrade into -read-only mode. For details on how Phabricator determines that a master is -unreachable, see "Unreachable Masters" below. - -If a master becomes unreachable, this normally corresponds to loss of the -master host, a severed network link, or some other sort of disaster. -Phabricator will degrade and continue operating in read-only mode until the -master recovers or operations personnel can assess the situation and intervene. +If Phabricator is unable to reach the master database, it will degrade into +read-only mode automatically. See "Unreachable Masters" below for details on +how this process works. If you end up in a situation where you have lost the master and can not get it back online (or can not restore it quickly) you can promote a replica to become @@ -122,7 +207,7 @@ Promoting a Replica =================== -TODO: Write this, too. +TODO: Write this section. Unreachable Masters @@ -131,7 +216,67 @@ This section describes how Phabricator determines that a master has been lost, marks it unreachable, and degrades into read-only mode. -TODO: For now, it doesn't. +Phabricator degrades into read-only mode automatically in two ways: very +briefly in response to a single connection failure, or more permanently in +response to a series of connection failures. + +In the first case, if a request needs to connect to the master but is not able +to, Phabricator will temporarily degrade into read-only mode for the remainder +of that request. The alternative is to fail abruptly, but Phabricator can +sometimes degrade successfully and still respond to the user's request, so it +makes an effort to finish serving the request from replicas. 
+ +If the request was a write (like posting a comment) it will fail anyway, but +if it was a read that did not actually need to use the master it may succeed. + +This temporary mode is intended to recover as gracefully as possible from brief +interruptions in service (a few seconds), like a server being restarted, a +network link becoming temporarily unavailable, or brief periods of load-related +disruption. If the anomaly is temporary, Phabricator should recover immediately +(on the next request once service is restored). + +This mode can be slow for users (they need to wait on connection attempts to +the master which fail) and does not reduce load on the master (requests still +attempt to connect to it). + +The second way Phabricator degrades is by running periodic health checks +against databases, and marking them unhealthy if they fail over a longer period +of time. This mechanism is very similar to the health checks that most HTTP +load balancers perform against web servers. + +If a database fails several health checks in a row, Phabricator will mark it as +unhealthy and stop sending all traffic (except for more health checks) to it. +This improves performance during a service interruption and reduces load on the +master, which may help it recover from load problems. + +You can monitor the status of health checks in the {nav Cluster Databases} +console. The "Health" column shows how many checks have run recently and +how many have succeeded. + +Health checks run every 3 seconds, and 5 checks in a row must fail or succeed +before Phabricator marks the database as unhealthy or healthy, so it will +generally take about 15 seconds for a database to change state after it goes +down or comes up. + +If all of the recent checks fail, Phabricator will mark the database as +unhealthy and stop sending traffic to it. If the master was the database that +was marked as unhealthy, Phabricator will actively degrade into read-only mode +until it recovers. 
+ +This mode only attempts to connect to the unhealthy database once every few +seconds to see if it is recovering, so performance will be better on average +(users rarely need to wait for bad connections to fail or time out) and the +database will receive less load. + +Once all of the recent checks succeed, Phabricator will mark the database as +healthy again and continue sending traffic to it. + +Health checks are tracked individually for each web server, so some web servers +may see a host as healthy while others see it as unhealthy. This is normal, and +can accurately reflect the state of the world: for example, the link between +datacenters may have been lost, so hosts in one datacenter can no longer see +the master, while hosts in the other datacenter still have a healthy link to +it. Backups diff --git a/src/infrastructure/cluster/PhabricatorDatabaseHealthRecord.php b/src/infrastructure/cluster/PhabricatorDatabaseHealthRecord.php --- a/src/infrastructure/cluster/PhabricatorDatabaseHealthRecord.php +++ b/src/infrastructure/cluster/PhabricatorDatabaseHealthRecord.php @@ -52,6 +52,7 @@ * the state. */ public function getRequiredEventCount() { + // NOTE: If you change this value, update the "Cluster: Databases" docs. return 5; } @@ -60,6 +61,7 @@ * Seconds to wait between health checks. */ public function getHealthCheckFrequency() { + // NOTE: If you change this value, update the "Cluster: Databases" docs. 
return 3; } diff --git a/src/infrastructure/cluster/PhabricatorDatabaseRef.php b/src/infrastructure/cluster/PhabricatorDatabaseRef.php --- a/src/infrastructure/cluster/PhabricatorDatabaseRef.php +++ b/src/infrastructure/cluster/PhabricatorDatabaseRef.php @@ -14,6 +14,7 @@ const REPLICATION_SLOW = 'replica-slow'; const KEY_REFS = 'cluster.db.refs'; + const KEY_INDIVIDUAL = 'cluster.db.individual'; private $host; private $port; @@ -21,6 +22,7 @@ private $pass; private $disabled; private $isMaster; + private $isIndividual; private $connectionLatency; private $connectionStatus; @@ -145,6 +147,15 @@ return $this->replicaDelay; } + public function setIsIndividual($is_individual) { + $this->isIndividual = $is_individual; + return $this; + } + + public function getIsIndividual() { + return $this->isIndividual; + } + public static function getConnectionStatusMap() { return array( self::STATUS_OKAY => array( @@ -207,6 +218,18 @@ return $refs; } + public static function getLiveIndividualRef() { + $cache = PhabricatorCaches::getRequestCache(); + + $ref = $cache->getKey(self::KEY_INDIVIDUAL); + if (!$ref) { + $ref = self::newIndividualRef(); + $cache->setKey(self::KEY_INDIVIDUAL, $ref); + } + + return $ref; + } + public static function newRefs() { $refs = array(); @@ -339,6 +362,14 @@ } public function isSevered() { + // If we only have an individual database, never sever our connection to + // it, at least for now. It's possible that using the same severing rules + // might eventually make sense to help alleviate load-related failures, + // but we should wait for all the cluster stuff to stabilize first. 
+ if ($this->getIsIndividual()) { + return false; + } + if ($this->didFailToConnect) { return true; } @@ -402,16 +433,7 @@ $refs = self::getLiveRefs(); if (!$refs) { - $conf = PhabricatorEnv::newObjectFromConfig( - 'mysql.configuration-provider', - array(null, 'w', null)); - - return id(new self()) - ->setHost($conf->getHost()) - ->setPort($conf->getPort()) - ->setUser($conf->getUser()) - ->setPass($conf->getPassword()) - ->setIsMaster(true); + return self::getLiveIndividualRef(); } $master = null; @@ -427,6 +449,20 @@ return null; } + public static function newIndividualRef() { + $conf = PhabricatorEnv::newObjectFromConfig( + 'mysql.configuration-provider', + array(null, 'w', null)); + + return id(new self()) + ->setHost($conf->getHost()) + ->setPort($conf->getPort()) + ->setUser($conf->getUser()) + ->setPass($conf->getPassword()) + ->setIsIndividual(true) + ->setIsMaster(true); + } + public static function getReplicaDatabaseRef() { $refs = self::getLiveRefs();