diff --git a/src/applications/config/controller/PhabricatorConfigClusterDatabasesController.php b/src/applications/config/controller/PhabricatorConfigClusterDatabasesController.php index b498030d23..1b3da10cae 100644 --- a/src/applications/config/controller/PhabricatorConfigClusterDatabasesController.php +++ b/src/applications/config/controller/PhabricatorConfigClusterDatabasesController.php @@ -1,210 +1,213 @@ buildSideNavView(); $nav->selectFilter('cluster/databases/'); $title = pht('Cluster Databases'); $crumbs = $this ->buildApplicationCrumbs($nav) ->addTextCrumb(pht('Cluster Databases')); $database_status = $this->buildClusterDatabaseStatus(); $view = id(new PHUITwoColumnView()) ->setNavigation($nav) ->setMainColumn($database_status); return $this->newPage() ->setTitle($title) ->setCrumbs($crumbs) ->appendChild($view); } private function buildClusterDatabaseStatus() { $viewer = $this->getViewer(); $databases = PhabricatorDatabaseRef::queryAll(); $connection_map = PhabricatorDatabaseRef::getConnectionStatusMap(); $replica_map = PhabricatorDatabaseRef::getReplicaStatusMap(); Javelin::initBehavior('phabricator-tooltips'); $rows = array(); foreach ($databases as $database) { + $messages = array(); + if ($database->getIsMaster()) { $role_icon = id(new PHUIIconView()) ->setIcon('fa-database sky') ->addSigil('has-tooltip') ->setMetadata( array( 'tip' => pht('Master'), )); } else { $role_icon = id(new PHUIIconView()) ->setIcon('fa-download') ->addSigil('has-tooltip') ->setMetadata( array( 'tip' => pht('Replica'), )); } if ($database->getDisabled()) { $conn_icon = 'fa-times'; $conn_color = 'grey'; $conn_label = pht('Disabled'); } else { $status = $database->getConnectionStatus(); $info = idx($connection_map, $status, array()); $conn_icon = idx($info, 'icon'); $conn_color = idx($info, 'color'); $conn_label = idx($info, 'label'); if ($status === PhabricatorDatabaseRef::STATUS_OKAY) { $latency = $database->getConnectionLatency(); $latency = (int)(1000000 * $latency); 
$conn_label = pht('%s us', new PhutilNumber($latency)); } } $connection = array( id(new PHUIIconView())->setIcon("{$conn_icon} {$conn_color}"), ' ', $conn_label, ); if ($database->getDisabled()) { $replica_icon = 'fa-times'; $replica_color = 'grey'; $replica_label = pht('Disabled'); } else { $status = $database->getReplicaStatus(); $info = idx($replica_map, $status, array()); $replica_icon = idx($info, 'icon'); $replica_color = idx($info, 'color'); $replica_label = idx($info, 'label'); if ($database->getIsMaster()) { if ($status === PhabricatorDatabaseRef::REPLICATION_OKAY) { $replica_icon = 'fa-database'; } } else { switch ($status) { case PhabricatorDatabaseRef::REPLICATION_OKAY: case PhabricatorDatabaseRef::REPLICATION_SLOW: $delay = $database->getReplicaDelay(); if ($delay) { $replica_label = pht('%ss Behind', new PhutilNumber($delay)); } else { $replica_label = pht('Up to Date'); } break; } } } $replication = array( id(new PHUIIconView())->setIcon("{$replica_icon} {$replica_color}"), ' ', $replica_label, ); $health = $database->getHealthRecord(); $health_up = $health->getUpEventCount(); $health_down = $health->getDownEventCount(); if ($health->getIsHealthy()) { $health_icon = id(new PHUIIconView()) ->setIcon('fa-plus green'); } else { $health_icon = id(new PHUIIconView()) ->setIcon('fa-times red'); + $messages[] = pht( + 'UNHEALTHY: This database has failed recent health checks. Traffic '. 
+ 'will not be sent to it until it recovers.'); } $health_count = pht( '%s / %s', new PhutilNumber($health_up), new PhutilNumber($health_up + $health_down)); $health_status = array( $health_icon, ' ', $health_count, ); - $messages = array(); - $conn_message = $database->getConnectionMessage(); if ($conn_message) { $messages[] = $conn_message; } $replica_message = $database->getReplicaMessage(); if ($replica_message) { $messages[] = $replica_message; } $messages = phutil_implode_html(phutil_tag('br'), $messages); $rows[] = array( $role_icon, $database->getHost(), $database->getPort(), $database->getUser(), $connection, $replication, $health_status, $messages, ); } $table = id(new AphrontTableView($rows)) ->setNoDataString( pht('Phabricator is not configured in cluster mode.')) ->setHeaders( array( null, pht('Host'), pht('Port'), pht('User'), pht('Connection'), pht('Replication'), pht('Health'), pht('Messages'), )) ->setColumnClasses( array( null, null, null, null, null, null, null, 'wide', )); $doc_href = PhabricatorEnv::getDoclink('Cluster: Databases'); $header = id(new PHUIHeaderView()) ->setHeader(pht('Cluster Database Status')) ->addActionLink( id(new PHUIButtonView()) ->setIcon('fa-book') ->setHref($doc_href) ->setTag('a') ->setText(pht('Database Clustering Documentation'))); return id(new PHUIObjectBoxView()) ->setHeader($header) ->setTable($table); } } diff --git a/src/docs/user/cluster/cluster_databases.diviner b/src/docs/user/cluster/cluster_databases.diviner index 96ee3ad44c..653b521ee0 100644 --- a/src/docs/user/cluster/cluster_databases.diviner +++ b/src/docs/user/cluster/cluster_databases.diviner @@ -1,165 +1,310 @@ @title Cluster: Databases @group intro Configuring Phabricator to use multiple database hosts. Overview ======== WARNING: This feature is a very early prototype; the features this document describes are mostly speculative fantasy. You can deploy Phabricator with multiple database hosts, configured as a master and a set of replicas. 
The advantages of doing this are: - faster recovery from disasters by promoting a replica; - graceful degradation if the master fails; - reduced load on the master; and - some tools to help monitor and manage replica health. This configuration is complex, and many installs do not need to pursue it. Phabricator can not currently be configured into a multi-master mode, nor can it be configured to automatically promote a replica to become the new master. +If you lose the master, Phabricator can degrade automatically into read-only +mode and remain available, but can not fully recover without operational +intervention unless the master recovers on its own. + Setting up MySQL Replication ============================ TODO: Write this section. Configuring Replicas ==================== Once your replicas are in working order, tell Phabricator about them by configuring the `cluster.database` option. This option must be configured from the command line or in configuration files because Phabricator needs to read it //before// it can connect to databases. This option value will list all of the database hosts that you want Phabricator to interact with: your master and all your replicas. Each entry in the list should have these keys: - `host`: //Required string.// The database host name. - `role`: //Required string.// The cluster role of this host, one of `master` or `replica`. - `port`: //Optional int.// The port to connect to. If omitted, the default port from `mysql.port` will be used. - `user`: //Optional string.// The MySQL username to use to connect to this host. If omitted, the default from `mysql.user` will be used. - `pass`: //Optional string.// The password to use to connect to this host. If omitted, the default from `mysql.pass` will be used. - `disabled`: //Optional bool.// If set to `true`, Phabricator will not connect to this host. You can use this to temporarily take a host out of service. When `cluster.databases` is configured the `mysql.host` option is not used. 
The other MySQL connection configuration options (`mysql.port`, `mysql.user`, `mysql.pass`) are used only to provide defaults. Once you've configured this option, restart Phabricator for the changes to take -effect, then continue to "Monitoring and Testing" to verify the configuration. +effect, then continue to "Monitoring Replicas" to verify the configuration. -Monitoring and Testing -====================== +Monitoring Replicas +=================== You can monitor replicas in {nav Config > Cluster Databases}. This interface shows you a quick overview of replicas and their health, and can detect some common issues with replication. -TODO: Write more stuff here. +The table on this page shows each database and current status. + +NOTE: This page runs its diagnostics //from the web server that is serving the +request//. If you are recovering from a disaster, the view this page shows +may be partial or misleading, and two requests served by different servers may +see different views of the cluster. + +**Connection**: Phabricator tries to connect to each configured database, then +shows the result in this column. If it fails, a brief diagnostic message with +details about the error is shown. If it succeeds, the column shows a rough +measurement of latency from the current webserver to the database. + +**Replication**: This is a summary of replication status on the database. If +things are properly configured and stable, the replicas should be actively +replicating and no more than a few seconds behind master, and the master +should //not// be replicating from another database. + +To report this status, the user Phabricator is connecting as must have the +`REPLICATION CLIENT` privilege (or the `SUPER` privilege) so it can run the +`SHOW SLAVE STATUS` command. The `REPLICATION CLIENT` privilege only enables +the user to run diagnostic commands so it should be reasonable to grant it in +most cases, but it is not required. 
If you choose not to grant it, this page +can not show any useful diagnostic information about replication status but +everything else will still work. + +If a replica is more than a second behind master, this page will show the +current replication delay. If the replication delay is more than 30 seconds, +it will report "Slow Replication" with a warning icon. + +If replication is delayed, data is at risk: if you lose the master and can not +later recover it (for example, because a meteor has obliterated the datacenter +housing the physical host), data which did not make it to the replica will be +lost forever. + +Beyond the risk of data loss, any read-only traffic sent to the replica will +see an older view of the world which could be confusing for users: it may +appear that their data has been lost, even if it is safe and just hasn't +replicated yet. + +Phabricator will attempt to prevent clients from seeing out-of-date views, but +sometimes sending traffic to a delayed replica is the best available option +(for example, if the master can not be reached). + +**Health**: This column shows the result of recent health checks against the +server. After several checks in a row fail, Phabricator will mark the server +as unhealthy and stop sending traffic to it until several checks in a row +later succeed. + +Note that each web server tracks database health independently, so if you have +several servers they may have different views of database health. This is +normal and not problematic. + +For more information on health checks, see "Unreachable Masters" below. + +**Messages**: This column has additional details about any errors shown in the +other columns. These messages can help you understand or resolve problems. + + +Testing Replicas +================ + +To test that your configuration can survive a disaster, turn off the master +database. Do this with great ceremony, making a cool explosion sound as you +run the `mysqld stop` command. 
+ +If things have been set up properly, Phabricator should degrade to a temporary +read-only mode immediately. After a brief period of unresponsiveness, it will +degrade further into a longer-term read-only mode. For details on how this +works internally, see "Unreachable Masters" below. + +Once satisfied, turn the master back on. After a brief delay, Phabricator +should recognize that the master is healthy again and recover fully. + +Throughout this process, the {nav Cluster Databases} console will show a +current view of the world from the perspective of the web server handling the +request. You can use it to monitor state. + +You can perform a more narrow test by enabling `cluster.read-only` in +configuration. This will put Phabricator into read-only mode immediately +without turning off any databases. + +You can use this mode to understand which capabilities will and will not be +available in read-only mode, and make sure any information you want to remain +accessible in a disaster (like wiki pages or contact information) is really +accessible. + +See the next section, "Degradation to Read-Only Mode", for more details about +when, why, and how Phabricator degrades. + +If you run custom code or extensions, they may not accommodate read-only mode +properly. You should specifically test that they function correctly in +read-only mode and do not prevent you from accessing important information. + Degradation to Read-Only Mode ============================= Phabricator will degrade to read-only mode when any of these conditions occur: - you turn it on explicitly; - you configure cluster mode, but don't set up any masters; - - the master is misconfigured and unsafe to write to; or - - the master is unreachable. + - the master can not be reached while handling a request; or + - recent attempts to connect to the master have consistently failed. 
When Phabricator is running in read-only mode, users can still read data and browse and clone repositories, but they can not edit, update, or push new changes. For example, users can still read disaster recovery information on the wiki or emergency contact information on user profiles. You can enable this mode explicitly by configuring `cluster.read-only`. Some reasons you might want to do this include: - to test that the mode works like you expect it to; - to make sure that information you need will be available; - to prevent new writes while performing database maintenance; or - to permanently archive a Phabricator install. You can also enable this mode implicitly by configuring `cluster.databases` but disabling the master, or by not specifying any host as a master. This may be more convenient than turning it on explicitly during the course of operations work. -Before writing to a master, Phabricator will verify that the host is not -configured as a replica. This is a safety feature to prevent data loss if your -MySQL and Phabricator configurations disagree about replica configuration. If -your `master` is currently replicating from another host, Phabricator will -treat it as a `replica` instead and implicitly degrade into read-only mode. - -Finally, if Phabricator is unable to reach the master, it will degrade into -read-only mode. For details on how Phabricator determines that a master is -unreachable, see "Unreachable Masters" below. - -If a master becomes unreachable, this normally corresponds to loss of the -master host, a severed network link, or some other sort of disaster. -Phabricator will degrade and continue operating in read-only mode until the -master recovers or operations personnel can assess the situation and intervene. +If Phabricator is unable to reach the master database, it will degrade into +read-only mode automatically. See "Unreachable Masters" below for details on +how this process works. 
If you end up in a situation where you have lost the master and can not get it back online (or can not restore it quickly) you can promote a replica to become the new master. See the next section, "Promoting a Replica", for details. Promoting a Replica =================== -TODO: Write this, too. +TODO: Write this section. Unreachable Masters =================== This section describes how Phabricator determines that a master has been lost, marks it unreachable, and degrades into read-only mode. -TODO: For now, it doesn't. +Phabricator degrades into read-only mode automatically in two ways: very +briefly in response to a single connection failure, or more permanently in +response to a series of connection failures. + +In the first case, if a request needs to connect to the master but is not able +to, Phabricator will temporarily degrade into read-only mode for the remainder +of that request. The alternative is to fail abruptly, but Phabricator can +sometimes degrade successfully and still respond to the user's request, so it +makes an effort to finish serving the request from replicas. + +If the request was a write (like posting a comment) it will fail anyway, but +if it was a read that did not actually need to use the master it may succeed. + +This temporary mode is intended to recover as gracefully as possible from brief +interruptions in service (a few seconds), like a server being restarted, a +network link becoming temporarily unavailable, or brief periods of load-related +disruption. If the anomaly is temporary, Phabricator should recover immediately +(on the next request once service is restored). + +This mode can be slow for users (they need to wait on connection attempts to +the master which fail) and does not reduce load on the master (requests still +attempt to connect to it). + +The second way Phabricator degrades is by running periodic health checks +against databases, and marking them unhealthy if they fail over a longer period +of time. 
This mechanism is very similar to the health checks that most HTTP +load balancers perform against web servers. + +If a database fails several health checks in a row, Phabricator will mark it as +unhealthy and stop sending all traffic (except for more health checks) to it. +This improves performance during a service interruption and reduces load on the +master, which may help it recover from load problems. + +You can monitor the status of health checks in the {nav Cluster Databases} +console. The "Health" column shows how many checks have run recently and +how many have succeeded. + +Health checks run every 3 seconds, and 5 checks in a row must fail or succeed +before Phabricator marks the database as unhealthy or healthy, so it will +generally take about 15 seconds for a database to change state after it goes +down or comes up. + +If all of the recent checks fail, Phabricator will mark the database as +unhealthy and stop sending traffic to it. If the master was the database that +was marked as unhealthy, Phabricator will actively degrade into read-only mode +until it recovers. + +This mode only attempts to connect to the unhealthy database once every few +seconds to see if it is recovering, so performance will be better on average +(users rarely need to wait for bad connections to fail or time out) and the +database will receive less load. + +Once all of the recent checks succeed, Phabricator will mark the database as +healthy again and continue sending traffic to it. + +Health checks are tracked individually for each web server, so some web servers +may see a host as healthy while others see it as unhealthy. This is normal, and +can accurately reflect the state of the world: for example, the link between +datacenters may have been lost, so hosts in one datacenter can no longer see +the master, while hosts in the other datacenter still have a healthy link to +it. Backups ====== Even if you configure replication, you should still retain separate backup snapshots. 
Replicas protect you from data loss if you lose a host, but they do not let you recover from data mutation mistakes. If something issues `DELETE` or `UPDATE` statements and destroys data on the master, the mutation will propagate to the replicas almost immediately and the data will be gone forever. Normally, the only way to recover this data is from backup snapshots. Although you should still have a backup process, your backup process can safely pull dumps from a replica instead of the master. This operation can be slow, so offloading it to a replica can make the perforance of the master more consistent. To dump from a replica, wait for this TODO to be resolved and then do whatever it says to do: TODO: Make `bin/storage dump` replica-aware. See T10758. Next Steps ========== Continue by: - returning to @{article:Clustering Introduction}. diff --git a/src/infrastructure/cluster/PhabricatorDatabaseHealthRecord.php b/src/infrastructure/cluster/PhabricatorDatabaseHealthRecord.php index 54530e4959..580b3f1b27 100644 --- a/src/infrastructure/cluster/PhabricatorDatabaseHealthRecord.php +++ b/src/infrastructure/cluster/PhabricatorDatabaseHealthRecord.php @@ -1,185 +1,187 @@ ref = $ref; $this->readState(); } /** * Is the database currently healthy? */ public function getIsHealthy() { return $this->isHealthy; } /** * Should this request check database health? */ public function getShouldCheck() { return $this->shouldCheck; } /** * How many recent health checks were successful? */ public function getUpEventCount() { return $this->upEventCount; } /** * How many recent health checks failed? */ public function getDownEventCount() { return $this->downEventCount; } /** * Number of failures or successes we need to see in a row before we change * the state. */ public function getRequiredEventCount() { + // NOTE: If you change this value, update the "Cluster: Databases" docs. return 5; } /** * Seconds to wait between health checks. 
*/ public function getHealthCheckFrequency() { + // NOTE: If you change this value, update the "Cluster: Databases" docs. return 3; } public function didHealthCheck($result) { $now = microtime(true); $check_frequency = $this->getHealthCheckFrequency(); $event_count = $this->getRequiredEventCount(); $record = $this->readHealthRecord(); $log = $record['log']; foreach ($log as $key => $event) { $when = idx($event, 'timestamp'); // If the log already has another nearby event, just ignore this one. // We raced with another process and our result can just be thrown away. if (($now - $when) <= $check_frequency) { return $this; } } $log[] = array( 'timestamp' => $now, 'up' => $result, ); // Throw away older events which are now obsolete. $log = array_slice($log, -$event_count); $count_up = 0; $count_down = 0; foreach ($log as $event) { if ($event['up']) { $count_up++; } else { $count_down++; } } // If all of the events are the same, change the state. if ($count_up == $event_count) { $record['up'] = true; } else if ($count_down == $event_count) { $record['up'] = false; } $record['log'] = $log; $this->writeHealthRecord($record); $this->isHealthy = $record['up']; $this->shouldCheck = false; $this->updateStatistics($record); return $this; } private function readState() { $now = microtime(true); $check_frequency = $this->getHealthCheckFrequency(); $record = $this->readHealthRecord(); $last_check = $record['lastCheck']; if (($now - $last_check) >= $check_frequency) { $record['lastCheck'] = $now; $this->writeHealthRecord($record); $this->shouldCheck = true; } else { $this->shouldCheck = false; } $this->isHealthy = $record['up']; $this->updateStatistics($record); } private function updateStatistics(array $record) { $this->upEventCount = 0; $this->downEventCount = 0; foreach ($record['log'] as $event) { if ($event['up']) { $this->upEventCount++; } else { $this->downEventCount++; } } } private function getHealthRecordCacheKey() { $ref = $this->ref; $host = $ref->getHost(); $port = 
$ref->getPort(); return "cluster.db.health({$host}, {$port})"; } private function readHealthRecord() { $cache = PhabricatorCaches::getSetupCache(); $cache_key = $this->getHealthRecordCacheKey(); $health_record = $cache->getKey($cache_key); if (!is_array($health_record)) { $health_record = array( 'up' => true, 'lastCheck' => 0, 'log' => array(), ); } return $health_record; } private function writeHealthRecord(array $record) { $cache = PhabricatorCaches::getSetupCache(); $cache_key = $this->getHealthRecordCacheKey(); $cache->setKey($cache_key, $record); } } diff --git a/src/infrastructure/cluster/PhabricatorDatabaseRef.php b/src/infrastructure/cluster/PhabricatorDatabaseRef.php index 207901f010..f8ca7a79a8 100644 --- a/src/infrastructure/cluster/PhabricatorDatabaseRef.php +++ b/src/infrastructure/cluster/PhabricatorDatabaseRef.php @@ -1,485 +1,521 @@ host = $host; return $this; } public function getHost() { return $this->host; } public function setPort($port) { $this->port = $port; return $this; } public function getPort() { return $this->port; } public function setUser($user) { $this->user = $user; return $this; } public function getUser() { return $this->user; } public function setPass(PhutilOpaqueEnvelope $pass) { $this->pass = $pass; return $this; } public function getPass() { return $this->pass; } public function setIsMaster($is_master) { $this->isMaster = $is_master; return $this; } public function getIsMaster() { return $this->isMaster; } public function setDisabled($disabled) { $this->disabled = $disabled; return $this; } public function getDisabled() { return $this->disabled; } public function setConnectionLatency($connection_latency) { $this->connectionLatency = $connection_latency; return $this; } public function getConnectionLatency() { return $this->connectionLatency; } public function setConnectionStatus($connection_status) { $this->connectionStatus = $connection_status; return $this; } public function getConnectionStatus() { if ($this->connectionStatus 
=== null) { throw new PhutilInvalidStateException('queryAll'); } return $this->connectionStatus; } public function setConnectionMessage($connection_message) { $this->connectionMessage = $connection_message; return $this; } public function getConnectionMessage() { return $this->connectionMessage; } public function setReplicaStatus($replica_status) { $this->replicaStatus = $replica_status; return $this; } public function getReplicaStatus() { return $this->replicaStatus; } public function setReplicaMessage($replica_message) { $this->replicaMessage = $replica_message; return $this; } public function getReplicaMessage() { return $this->replicaMessage; } public function setReplicaDelay($replica_delay) { $this->replicaDelay = $replica_delay; return $this; } public function getReplicaDelay() { return $this->replicaDelay; } + public function setIsIndividual($is_individual) { + $this->isIndividual = $is_individual; + return $this; + } + + public function getIsIndividual() { + return $this->isIndividual; + } + public static function getConnectionStatusMap() { return array( self::STATUS_OKAY => array( 'icon' => 'fa-exchange', 'color' => 'green', 'label' => pht('Okay'), ), self::STATUS_FAIL => array( 'icon' => 'fa-times', 'color' => 'red', 'label' => pht('Failed'), ), self::STATUS_AUTH => array( 'icon' => 'fa-key', 'color' => 'red', 'label' => pht('Invalid Credentials'), ), self::STATUS_REPLICATION_CLIENT => array( 'icon' => 'fa-eye-slash', 'color' => 'yellow', 'label' => pht('Missing Permission'), ), ); } public static function getReplicaStatusMap() { return array( self::REPLICATION_OKAY => array( 'icon' => 'fa-download', 'color' => 'green', 'label' => pht('Okay'), ), self::REPLICATION_MASTER_REPLICA => array( 'icon' => 'fa-database', 'color' => 'red', 'label' => pht('Replicating Master'), ), self::REPLICATION_REPLICA_NONE => array( 'icon' => 'fa-download', 'color' => 'red', 'label' => pht('Not Replicating'), ), self::REPLICATION_SLOW => array( 'icon' => 'fa-hourglass', 
'color' => 'red', 'label' => pht('Slow Replication'), ), ); } public static function getLiveRefs() { $cache = PhabricatorCaches::getRequestCache(); $refs = $cache->getKey(self::KEY_REFS); if (!$refs) { $refs = self::newRefs(); $cache->setKey(self::KEY_REFS, $refs); } return $refs; } + public static function getLiveIndividualRef() { + $cache = PhabricatorCaches::getRequestCache(); + + $ref = $cache->getKey(self::KEY_INDIVIDUAL); + if (!$ref) { + $ref = self::newIndividualRef(); + $cache->setKey(self::KEY_INDIVIDUAL, $ref); + } + + return $ref; + } + public static function newRefs() { $refs = array(); $default_port = PhabricatorEnv::getEnvConfig('mysql.port'); $default_port = nonempty($default_port, 3306); $default_user = PhabricatorEnv::getEnvConfig('mysql.user'); $default_pass = PhabricatorEnv::getEnvConfig('mysql.pass'); $default_pass = new PhutilOpaqueEnvelope($default_pass); $config = PhabricatorEnv::getEnvConfig('cluster.databases'); foreach ($config as $server) { $host = $server['host']; $port = idx($server, 'port', $default_port); $user = idx($server, 'user', $default_user); $disabled = idx($server, 'disabled', false); $pass = idx($server, 'pass'); if ($pass) { $pass = new PhutilOpaqueEnvelope($pass); } else { $pass = clone $default_pass; } $role = $server['role']; $ref = id(new self()) ->setHost($host) ->setPort($port) ->setUser($user) ->setPass($pass) ->setDisabled($disabled) ->setIsMaster(($role == 'master')); $refs[] = $ref; } return $refs; } public static function queryAll() { $refs = self::newRefs(); foreach ($refs as $ref) { if ($ref->getDisabled()) { continue; } $conn = $ref->newManagementConnection(); $t_start = microtime(true); $replica_status = false; try { $replica_status = queryfx_one($conn, 'SHOW SLAVE STATUS'); $ref->setConnectionStatus(self::STATUS_OKAY); } catch (AphrontAccessDeniedQueryException $ex) { $ref->setConnectionStatus(self::STATUS_REPLICATION_CLIENT); $ref->setConnectionMessage( pht( 'No permission to run "SHOW SLAVE STATUS". 
Grant this user '. '"REPLICATION CLIENT" permission to allow Phabricator to '. 'monitor replica health.')); } catch (AphrontInvalidCredentialsQueryException $ex) { $ref->setConnectionStatus(self::STATUS_AUTH); $ref->setConnectionMessage($ex->getMessage()); } catch (AphrontQueryException $ex) { $ref->setConnectionStatus(self::STATUS_FAIL); $class = get_class($ex); $message = $ex->getMessage(); $ref->setConnectionMessage( pht( '%s: %s', get_class($ex), $ex->getMessage())); } $t_end = microtime(true); $ref->setConnectionLatency($t_end - $t_start); if ($replica_status !== false) { $is_replica = (bool)$replica_status; if ($ref->getIsMaster() && $is_replica) { $ref->setReplicaStatus(self::REPLICATION_MASTER_REPLICA); $ref->setReplicaMessage( pht( 'This host has a "master" role, but is replicating data from '. 'another host ("%s")!', idx($replica_status, 'Master_Host'))); } else if (!$ref->getIsMaster() && !$is_replica) { $ref->setReplicaStatus(self::REPLICATION_REPLICA_NONE); $ref->setReplicaMessage( pht( 'This host has a "replica" role, but is not replicating data '. 'from a master (no output from "SHOW SLAVE STATUS").')); } else { $ref->setReplicaStatus(self::REPLICATION_OKAY); } if ($is_replica) { $latency = (int)idx($replica_status, 'Seconds_Behind_Master'); $ref->setReplicaDelay($latency); if ($latency > 30) { $ref->setReplicaStatus(self::REPLICATION_SLOW); $ref->setReplicaMessage( pht( 'This replica is lagging far behind the master. Data is at '. 'risk!')); } } } } return $refs; } public function newManagementConnection() { return $this->newConnection( array( 'retries' => 0, 'timeout' => 2, )); } public function newApplicationConnection($database) { return $this->newConnection( array( 'database' => $database, )); } public function isSevered() { + // If we only have an individual database, never sever our connection to + // it, at least for now. 
It's possible that using the same severing rules + // might eventually make sense to help alleviate load-related failures, + // but we should wait for all the cluster stuff to stabilize first. + if ($this->getIsIndividual()) { + return false; + } + if ($this->didFailToConnect) { return true; } $record = $this->getHealthRecord(); $is_healthy = $record->getIsHealthy(); if (!$is_healthy) { return true; } return false; } public function isReachable(AphrontDatabaseConnection $connection) { $record = $this->getHealthRecord(); $should_check = $record->getShouldCheck(); if ($this->isSevered() && !$should_check) { return false; } try { $connection->openConnection(); $reachable = true; } catch (Exception $ex) { $reachable = false; } if ($should_check) { $record->didHealthCheck($reachable); } if (!$reachable) { $this->didFailToConnect = true; } return $reachable; } public function checkHealth() { $health = $this->getHealthRecord(); $should_check = $health->getShouldCheck(); if ($should_check) { // This does an implicit health update. 
$connection = $this->newManagementConnection(); $this->isReachable($connection); } return $this; } public function getHealthRecord() { if (!$this->healthRecord) { $this->healthRecord = new PhabricatorDatabaseHealthRecord($this); } return $this->healthRecord; } public static function getMasterDatabaseRef() { $refs = self::getLiveRefs(); if (!$refs) { - $conf = PhabricatorEnv::newObjectFromConfig( - 'mysql.configuration-provider', - array(null, 'w', null)); - - return id(new self()) - ->setHost($conf->getHost()) - ->setPort($conf->getPort()) - ->setUser($conf->getUser()) - ->setPass($conf->getPassword()) - ->setIsMaster(true); + return self::getLiveIndividualRef(); } $master = null; foreach ($refs as $ref) { if ($ref->getDisabled()) { continue; } if ($ref->getIsMaster()) { return $ref; } } return null; } + public static function newIndividualRef() { + $conf = PhabricatorEnv::newObjectFromConfig( + 'mysql.configuration-provider', + array(null, 'w', null)); + + return id(new self()) + ->setHost($conf->getHost()) + ->setPort($conf->getPort()) + ->setUser($conf->getUser()) + ->setPass($conf->getPassword()) + ->setIsIndividual(true) + ->setIsMaster(true); + } + public static function getReplicaDatabaseRef() { $refs = self::getLiveRefs(); if (!$refs) { return null; } // TODO: We may have multiple replicas to choose from, and could make // more of an effort to pick the "best" one here instead of always // picking the first one. Once we've picked one, we should try to use // the same replica for the rest of the request, though. foreach ($refs as $ref) { if ($ref->getDisabled()) { continue; } if ($ref->getIsMaster()) { continue; } return $ref; } return null; } private function newConnection(array $options) { // If we believe the database is unhealthy, don't spend as much time // trying to connect to it, since it's likely to continue to fail and // hammering it can only make the problem worse. 
$record = $this->getHealthRecord(); if ($record->getIsHealthy()) { $default_retries = 3; $default_timeout = 10; } else { $default_retries = 0; $default_timeout = 2; } $spec = $options + array( 'user' => $this->getUser(), 'pass' => $this->getPass(), 'host' => $this->getHost(), 'port' => $this->getPort(), 'database' => null, 'retries' => $default_retries, 'timeout' => $default_timeout, ); return PhabricatorEnv::newObjectFromConfig( 'mysql.implementation', array( $spec, )); } }