Page Menu
Home
Phabricator
Search
Configure Global Search
Log In
Files
F14846922
D15792.id.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
12 KB
Referenced Files
None
Subscribers
None
D15792.id.diff
View Options
diff --git a/resources/sql/autopatches/20160424.locks.1.sql b/resources/sql/autopatches/20160424.locks.1.sql
new file mode 100644
--- /dev/null
+++ b/resources/sql/autopatches/20160424.locks.1.sql
@@ -0,0 +1,2 @@
+ALTER TABLE {$NAMESPACE}_repository.repository_workingcopyversion
+ ADD lockOwner VARCHAR(255) COLLATE {$COLLATE_TEXT};
diff --git a/src/applications/diffusion/protocol/DiffusionRepositoryClusterEngine.php b/src/applications/diffusion/protocol/DiffusionRepositoryClusterEngine.php
--- a/src/applications/diffusion/protocol/DiffusionRepositoryClusterEngine.php
+++ b/src/applications/diffusion/protocol/DiffusionRepositoryClusterEngine.php
@@ -11,9 +11,11 @@
private $repository;
private $viewer;
+ private $logger;
+
private $clusterWriteLock;
private $clusterWriteVersion;
- private $logger;
+ private $clusterWriteOwner;
/* -( Configuring Synchronization )---------------------------------------- */
@@ -247,9 +249,14 @@
$device = AlmanacKeys::getLiveDevice();
$device_phid = $device->getPHID();
+ $table = new PhabricatorRepositoryWorkingCopyVersion();
+ $locked_connection = $table->establishConnection('w');
+
$write_lock = PhabricatorRepositoryWorkingCopyVersion::getWriteLock(
$repository_phid);
+ $write_lock->useSpecificConnection($locked_connection);
+
$lock_wait = phutil_units('2 minutes in seconds');
$this->logLine(
@@ -290,7 +297,7 @@
throw new Exception(
pht(
'An previous write to this repository was interrupted; refusing '.
- 'new writes. This issue resolves operator intervention to resolve, '.
+ 'new writes. This issue requires operator intervention to resolve, '.
'see "Write Interruptions" in the "Cluster: Repositories" in the '.
'documentation for instructions.'));
}
@@ -302,14 +309,20 @@
throw $ex;
}
+ $pid = getmypid();
+ $hash = Filesystem::readRandomCharacters(12);
+ $this->clusterWriteOwner = "{$pid}.{$hash}";
+
PhabricatorRepositoryWorkingCopyVersion::willWrite(
+ $locked_connection,
$repository_phid,
$device_phid,
array(
'userPHID' => $viewer->getPHID(),
'epoch' => PhabricatorTime::getNow(),
'devicePHID' => $device_phid,
- ));
+ ),
+ $this->clusterWriteOwner);
$this->clusterWriteVersion = $max_version;
$this->clusterWriteLock = $write_lock;
@@ -337,31 +350,105 @@
$device = AlmanacKeys::getLiveDevice();
$device_phid = $device->getPHID();
- // NOTE: This means we're still bumping the version when pushes fail. We
- // could select only un-rejected events instead to bump a little less
- // often.
+ // It is possible that we've lost the global lock while receiving the push.
+ // For example, the master database may have been restarted between the
+ // time we acquired the global lock and now, when the push has finished.
+
+ // We wrote a durable lock while we were holding the the global lock,
+ // essentially upgrading our lock. We can still safely release this upgraded
+ // lock even if we're no longer holding the global lock.
+
+ // If we fail to release the lock, the repository will be frozen until
+ // an operator can figure out what happened, so we try pretty hard to
+ // reconnect to the database and release the lock.
- $new_log = id(new PhabricatorRepositoryPushEventQuery())
- ->setViewer(PhabricatorUser::getOmnipotentUser())
- ->withRepositoryPHIDs(array($repository_phid))
- ->setLimit(1)
- ->executeOne();
+ $now = PhabricatorTime::getNow();
+ $duration = phutil_units('5 minutes in seconds');
+ $try_until = $now + $duration;
- $old_version = $this->clusterWriteVersion;
- if ($new_log) {
- $new_version = $new_log->getID();
+ $did_release = false;
+ $already_failed = false;
+ while (PhabricatorTime::getNow() <= $try_until) {
+ try {
+ // NOTE: This means we're still bumping the version when pushes fail. We
+ // could select only un-rejected events instead to bump a little less
+ // often.
+
+ $new_log = id(new PhabricatorRepositoryPushEventQuery())
+ ->setViewer(PhabricatorUser::getOmnipotentUser())
+ ->withRepositoryPHIDs(array($repository_phid))
+ ->setLimit(1)
+ ->executeOne();
+
+ $old_version = $this->clusterWriteVersion;
+ if ($new_log) {
+ $new_version = $new_log->getID();
+ } else {
+ $new_version = $old_version;
+ }
+
+ PhabricatorRepositoryWorkingCopyVersion::didWrite(
+ $repository_phid,
+ $device_phid,
+ $this->clusterWriteVersion,
+ $new_log->getID(),
+ $this->clusterWriteOwner);
+ $did_release = true;
+ break;
+ } catch (AphrontConnectionQueryException $ex) {
+ $connection_exception = $ex;
+ } catch (AphrontConnectionLostQueryException $ex) {
+ $connection_exception = $ex;
+ }
+
+ if (!$already_failed) {
+ $already_failed = true;
+ $this->logLine(
+ pht('CRITICAL. Failed to release cluster write lock!'));
+
+ $this->logLine(
+ pht(
+ 'The connection to the master database was lost while receiving '.
+ 'the write.'));
+
+ $this->logLine(
+ pht(
+ 'This process will spend %s more second(s) attempting to '.
+ 'recover, then give up.',
+ new PhutilNumber($duration)));
+ }
+
+ sleep(1);
+ }
+
+ if ($did_release) {
+ if ($already_failed) {
+ $this->logLine(
+ pht('RECOVERED. Link to master database was restored.'));
+ }
+ $this->logLine(pht('Released cluster write lock.'));
} else {
- $new_version = $old_version;
+ throw new Exception(
+ pht(
+ 'Failed to reconnect to master database and release held write '.
+ 'lock ("%s") on device "%s" for repository "%s" after trying '.
+ 'for %s seconds(s). This repository will be frozen.',
+ $this->clusterWriteOwner,
+ $device->getName(),
+ $this->getDisplayName(),
+ new PhutilNumber($duration)));
}
- PhabricatorRepositoryWorkingCopyVersion::didWrite(
- $repository_phid,
- $device_phid,
- $this->clusterWriteVersion,
- $new_log->getID());
+ // We can continue even if we've lost this lock, everything is still
+ // consistent.
+ try {
+ $this->clusterWriteLock->unlock();
+ } catch (Exception $ex) {
+ // Ignore.
+ }
- $this->clusterWriteLock->unlock();
$this->clusterWriteLock = null;
+ $this->clusterWriteOwner = null;
}
diff --git a/src/applications/diffusion/ssh/DiffusionGitSSHWorkflow.php b/src/applications/diffusion/ssh/DiffusionGitSSHWorkflow.php
--- a/src/applications/diffusion/ssh/DiffusionGitSSHWorkflow.php
+++ b/src/applications/diffusion/ssh/DiffusionGitSSHWorkflow.php
@@ -11,6 +11,7 @@
public function writeClusterEngineLogMessage($message) {
parent::writeError($message);
+ $this->getErrorChannel()->update();
}
protected function identifyRepository() {
diff --git a/src/applications/diffusion/ssh/DiffusionSSHWorkflow.php b/src/applications/diffusion/ssh/DiffusionSSHWorkflow.php
--- a/src/applications/diffusion/ssh/DiffusionSSHWorkflow.php
+++ b/src/applications/diffusion/ssh/DiffusionSSHWorkflow.php
@@ -55,6 +55,21 @@
return $this;
}
+ protected function getCurrentDeviceName() {
+ $device = AlmanacKeys::getLiveDevice();
+ if ($device) {
+ return $device->getName();
+ }
+
+ return php_uname('n');
+ }
+
+ protected function getTargetDeviceName() {
+ // TODO: This should use the correct device identity.
+ $uri = new PhutilURI($this->proxyURI);
+ return $uri->getDomain();
+ }
+
protected function shouldProxy() {
return (bool)$this->proxyURI;
}
diff --git a/src/applications/repository/storage/PhabricatorRepositoryWorkingCopyVersion.php b/src/applications/repository/storage/PhabricatorRepositoryWorkingCopyVersion.php
--- a/src/applications/repository/storage/PhabricatorRepositoryWorkingCopyVersion.php
+++ b/src/applications/repository/storage/PhabricatorRepositoryWorkingCopyVersion.php
@@ -7,6 +7,7 @@
protected $devicePHID;
protected $repositoryVersion;
protected $isWriting;
+ protected $lockOwner;
protected $writeProperties;
protected function getConfiguration() {
@@ -16,6 +17,7 @@
'repositoryVersion' => 'uint32',
'isWriting' => 'bool',
'writeProperties' => 'text?',
+ 'lockOwner' => 'text255?',
),
self::CONFIG_KEY_SCHEMA => array(
'key_workingcopy' => array(
@@ -69,29 +71,33 @@
* by default.
*/
public static function willWrite(
+ AphrontDatabaseConnection $locked_connection,
$repository_phid,
$device_phid,
- array $write_properties) {
+ array $write_properties,
+ $lock_owner) {
+
$version = new self();
- $conn_w = $version->establishConnection('w');
$table = $version->getTableName();
queryfx(
- $conn_w,
+ $locked_connection,
'INSERT INTO %T
(repositoryPHID, devicePHID, repositoryVersion, isWriting,
- writeProperties)
+ writeProperties, lockOwner)
VALUES
- (%s, %s, %d, %d, %s)
+ (%s, %s, %d, %d, %s, %s)
ON DUPLICATE KEY UPDATE
isWriting = VALUES(isWriting),
- writeProperties = VALUES(writeProperties)',
+ writeProperties = VALUES(writeProperties),
+ lockOwner = VALUES(lockOwner)',
$table,
$repository_phid,
$device_phid,
0,
1,
- phutil_json_encode($write_properties));
+ phutil_json_encode($write_properties),
+ $lock_owner);
}
@@ -102,7 +108,9 @@
$repository_phid,
$device_phid,
$old_version,
- $new_version) {
+ $new_version,
+ $lock_owner) {
+
$version = new self();
$conn_w = $version->establishConnection('w');
$table = $version->getTableName();
@@ -111,17 +119,20 @@
$conn_w,
'UPDATE %T SET
repositoryVersion = %d,
- isWriting = 0
+ isWriting = 0,
+ lockOwner = NULL
WHERE
repositoryPHID = %s AND
devicePHID = %s AND
repositoryVersion = %d AND
- isWriting = 1',
+ isWriting = 1 AND
+ lockOwner = %s',
$table,
$new_version,
$repository_phid,
$device_phid,
- $old_version);
+ $old_version,
+ $lock_owner);
}
diff --git a/src/infrastructure/util/PhabricatorGlobalLock.php b/src/infrastructure/util/PhabricatorGlobalLock.php
--- a/src/infrastructure/util/PhabricatorGlobalLock.php
+++ b/src/infrastructure/util/PhabricatorGlobalLock.php
@@ -29,6 +29,7 @@
final class PhabricatorGlobalLock extends PhutilLock {
private $conn;
+ private $isExternalConnection = false;
private static $pool = array();
@@ -74,6 +75,7 @@
*/
public function useSpecificConnection(AphrontDatabaseConnection $conn) {
$this->conn = $conn;
+ $this->isExternalConnection = true;
return $this;
}
@@ -109,29 +111,54 @@
$max_allowed_timeout = 2147483;
queryfx($conn, 'SET wait_timeout = %d', $max_allowed_timeout);
+ $lock_name = $this->getName();
+
$result = queryfx_one(
$conn,
'SELECT GET_LOCK(%s, %f)',
- $this->getName(),
+ $lock_name,
$wait);
$ok = head($result);
if (!$ok) {
- throw new PhutilLockException($this->getName());
+ throw new PhutilLockException($lock_name);
}
+ $conn->rememberLock($lock_name);
+
$this->conn = $conn;
}
protected function doUnlock() {
- queryfx(
- $this->conn,
- 'SELECT RELEASE_LOCK(%s)',
- $this->getName());
+ $lock_name = $this->getName();
+
+ $conn = $this->conn;
+
+ try {
+ $result = queryfx_one(
+ $conn,
+ 'SELECT RELEASE_LOCK(%s)',
+ $lock_name);
+ $conn->forgetLock($lock_name);
+ } catch (Exception $ex) {
+ $result = array(null);
+ }
+
+ $ok = head($result);
+ if (!$ok) {
+ // TODO: We could throw here, but then this lock doesn't get marked
+ // unlocked and we throw again later when exiting. It also doesn't
+ // particularly matter for any current applications. For now, just
+ // swallow the error.
+ }
- $this->conn->close();
- self::$pool[] = $this->conn;
$this->conn = null;
+ $this->isExternalConnection = false;
+
+ if (!$this->isExternalConnection) {
+ $conn->close();
+ self::$pool[] = $conn;
+ }
}
}
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Tue, Feb 4, 8:39 AM (15 h, 26 m)
Storage Engine
blob
Storage Format
Encrypted (AES-256-CBC)
Storage Handle
7089156
Default Alt Text
D15792.id.diff (12 KB)
Attached To
Mode
D15792: Make cluster repositories more resistant to freezing
Attached
Detach File
Event Timeline
Log In to Comment