diff --git a/src/applications/drydock/worker/DrydockLeaseUpdateWorker.php b/src/applications/drydock/worker/DrydockLeaseUpdateWorker.php --- a/src/applications/drydock/worker/DrydockLeaseUpdateWorker.php +++ b/src/applications/drydock/worker/DrydockLeaseUpdateWorker.php @@ -188,137 +188,360 @@ // First, try to find a suitable open resource which we can acquire a new // lease on. - $resources = $this->loadResourcesForAllocatingLease($blueprints, $lease); + $resources = $this->loadAcquirableResourcesForLease($blueprints, $lease); - // If no resources exist yet, see if we can build one. - if (!$resources) { - $usable_blueprints = $this->removeOverallocatedBlueprints( - $blueprints, - $lease); - - // If we get nothing back here, some blueprint claims it can eventually - // satisfy the lease, just not right now. This is a temporary failure, - // and we expect allocation to succeed eventually. - if (!$usable_blueprints) { - $blueprints = $this->rankBlueprints($blueprints, $lease); - - // Try to actively reclaim unused resources. If we succeed, jump back - // into the queue in an effort to claim it. - foreach ($blueprints as $blueprint) { - $reclaimed = $this->reclaimResources($blueprint, $lease); - if ($reclaimed) { - $lease->logEvent( - DrydockLeaseReclaimLogType::LOGCONST, - array( - 'resourcePHIDs' => array($reclaimed->getPHID()), - )); - - throw new PhabricatorWorkerYieldException(15); - } - } + list($free_resources, $used_resources) = $this->partitionResources( + $lease, + $resources); + + $resource = $this->leaseAnyResource($lease, $free_resources); + if ($resource) { + return $resource; + } + + // We're about to try creating a resource. If we're already creating + // something, just yield until that resolves. + + $this->yieldForPendingResources($lease); + + // We haven't been able to lease an existing resource yet, so now we try to + // create one. We may still have some less-desirable "used" resources that + // we'll sometimes try to lease later if we fail to allocate a new resource. + + $resource = $this->newLeasedResource($lease, $blueprints); + if ($resource) { + return $resource; + } + + // We haven't been able to lease a desirable "free" resource or create a + // new resource. Try to lease a "used" resource. + + $resource = $this->leaseAnyResource($lease, $used_resources); + if ($resource) { + return $resource; + } + + // If this lease has already triggered a reclaim, just yield and wait for + // it to resolve. + $this->yieldForReclaimingResources($lease); + + // Try to reclaim a resource. This will yield if it reclaims something. + $this->reclaimAnyResource($lease, $blueprints); + + // We weren't able to lease, create, or reclaim any resources. We just have + // to wait for resources to become available. + + $lease->logEvent( + DrydockLeaseWaitingForResourcesLogType::LOGCONST, + array( + 'blueprintPHIDs' => mpull($blueprints, 'getPHID'), + )); + + throw new PhabricatorWorkerYieldException(15); + } + + private function reclaimAnyResource(DrydockLease $lease, array $blueprints) { + assert_instances_of($blueprints, 'DrydockBlueprint'); + + $blueprints = $this->rankBlueprints($blueprints, $lease); + + // Try to actively reclaim unused resources. If we succeed, jump back + // into the queue in an effort to claim it. + + foreach ($blueprints as $blueprint) { + $reclaimed = $this->reclaimResources($blueprint, $lease); + if ($reclaimed) { $lease->logEvent( - DrydockLeaseWaitingForResourcesLogType::LOGCONST, + DrydockLeaseReclaimLogType::LOGCONST, array( - 'blueprintPHIDs' => mpull($blueprints, 'getPHID'), + 'resourcePHIDs' => array($reclaimed->getPHID()), )); + // Yield explicitly here: we'll be awakened when the resource is + // reclaimed. + throw new PhabricatorWorkerYieldException(15); } + } + } - $usable_blueprints = $this->rankBlueprints($usable_blueprints, $lease); - - $exceptions = array(); - foreach ($usable_blueprints as $blueprint) { - try { - $resources[] = $this->allocateResource($blueprint, $lease); - - // Bail after allocating one resource, we don't need any more than - // this. - break; - } catch (Exception $ex) { - // This failure is not normally expected, so log it. It can be - // caused by something mundane and recoverable, however (see below - // for discussion). - - // We log to the blueprint separately from the log to the lease: - // the lease is not attached to a blueprint yet so the lease log - // will not show up on the blueprint; more than one blueprint may - // fail; and the lease is not really impacted (and won't log) if at - // least one blueprint actually works. - - $blueprint->logEvent( - DrydockResourceAllocationFailureLogType::LOGCONST, - array( - 'class' => get_class($ex), - 'message' => $ex->getMessage(), - )); + private function yieldForPendingResources(DrydockLease $lease) { + // See T13677. If this lease has already triggered the allocation of + // one or more resources and they are still pending, just yield and + // wait for them. - $exceptions[] = $ex; - } + $viewer = $this->getViewer(); + + $phids = $lease->getAllocatedResourcePHIDs(); + if (!$phids) { + return null; + } + + $resources = id(new DrydockResourceQuery()) + ->setViewer($viewer) + ->withPHIDs($phids) + ->withStatuses( + array( + DrydockResourceStatus::STATUS_PENDING, + )) + ->setLimit(1) + ->execute(); + if (!$resources) { + return; + } + + $lease->logEvent( + DrydockLeaseWaitingForActivationLogType::LOGCONST, + array( + 'resourcePHIDs' => mpull($resources, 'getPHID'), + )); + + throw new PhabricatorWorkerYieldException(15); + } + + private function yieldForReclaimingResources(DrydockLease $lease) { + $viewer = $this->getViewer(); + + $phids = $lease->getReclaimedResourcePHIDs(); + if (!$phids) { + return; + } + + $resources = id(new DrydockResourceQuery()) + ->setViewer($viewer) + ->withPHIDs($phids) + ->withStatuses( + array( + DrydockResourceStatus::STATUS_ACTIVE, + )) + ->setLimit(1) + ->execute(); + if (!$resources) { + return; + } + + $lease->logEvent( + DrydockLeaseWaitingForReclamationLogType::LOGCONST, + array( + 'resourcePHIDs' => mpull($resources, 'getPHID'), + )); + + throw new PhabricatorWorkerYieldException(15); + } + + private function newLeasedResource( + DrydockLease $lease, + array $blueprints) { + assert_instances_of($blueprints, 'DrydockBlueprint'); + + $usable_blueprints = $this->removeOverallocatedBlueprints( + $blueprints, + $lease); + + // If we get nothing back here, some blueprint claims it can eventually + // satisfy the lease, just not right now. This is a temporary failure, + // and we expect allocation to succeed eventually. + + // Return, try to lease a "used" resource, and continue from there. + + if (!$usable_blueprints) { + return null; + } + + $usable_blueprints = $this->rankBlueprints($usable_blueprints, $lease); + + $new_resources = $this->newResources($lease, $usable_blueprints); + if (!$new_resources) { + // If we were unable to create any new resources, return and + // try to lease a "used" resource. + return null; + } + + $new_resources = $this->removeUnacquirableResources( + $new_resources, + $lease); + if (!$new_resources) { + // If we make it here, we just built a resource but aren't allowed + // to acquire it. We expect this to happen if the resource prevents + // acquisition until it activates, which is common when a resource + // needs to perform setup steps. + + // Explicitly yield and wait for activation, since we don't want to + // lease a "used" resource. + + throw new PhabricatorWorkerYieldException(15); + } + + $resource = $this->leaseAnyResource($lease, $new_resources); + if ($resource) { + return $resource; + } + + // We may not be able to lease a resource even if we just built it: + // another process may snatch it up before we can lease it. This should + // be rare, but is not concerning. Just try to build another resource. + + // We likely could try to build the next resource immediately, but err on + // the side of caution and yield for now, at least until this code is + // better vetted. + + throw new PhabricatorWorkerYieldException(15); + } + + private function partitionResources( + DrydockLease $lease, + array $resources) { + + assert_instances_of($resources, 'DrydockResource'); + $viewer = $this->getViewer(); + + $lease_statuses = array( + DrydockLeaseStatus::STATUS_PENDING, + DrydockLeaseStatus::STATUS_ACQUIRED, + DrydockLeaseStatus::STATUS_ACTIVE, + ); + + // Partition resources into "free" resources (which we can try to lease + // immediately) and "used" resources, which we can only to lease after we + // fail to allocate a new resource. + + // "Free" resources are unleased and/or prefer reuse over allocation. + // "Used" resources are leased and prefer allocation over reuse. + + $free_resources = array(); + $used_resources = array(); + + foreach ($resources as $resource) { + $blueprint = $resource->getBlueprint(); + + if (!$blueprint->shouldAllocateSupplementalResource($resource, $lease)) { + $free_resources[] = $resource; + continue; } - if (!$resources) { - // If one or more blueprints claimed that they would be able to - // allocate resources but none are actually able to allocate resources, - // log the failure and yield so we try again soon. + $leases = id(new DrydockLeaseQuery()) + ->setViewer($viewer) + ->withResourcePHIDs(array($resource->getPHID())) + ->withStatuses($lease_statuses) + ->setLimit(1) + ->execute(); + if (!$leases) { + $free_resources[] = $resource; + continue; + } - // This can happen if some unexpected issue occurs during allocation - // (for example, a call to build a VM fails for some reason) or if we - // raced another allocator and the blueprint is now full. + $used_resources[] = $resource; + } - $ex = new PhutilAggregateException( - pht( - 'All blueprints failed to allocate a suitable new resource when '. - 'trying to allocate lease ("%s").', - $lease->getPHID()), - $exceptions); + return array($free_resources, $used_resources); + } - $lease->logEvent( - DrydockLeaseAllocationFailureLogType::LOGCONST, + private function newResources( + DrydockLease $lease, + array $blueprints) { + assert_instances_of($blueprints, 'DrydockBlueprint'); + + $resources = array(); + $exceptions = array(); + foreach ($blueprints as $blueprint) { + $caught = null; + try { + $resources[] = $this->allocateResource($blueprint, $lease); + + // Bail after allocating one resource, we don't need any more than + // this. + break; + } catch (Exception $ex) { + $caught = $ex; + } catch (Throwable $ex) { + $caught = $ex; + } + + if ($caught) { + // This failure is not normally expected, so log it. It can be + // caused by something mundane and recoverable, however (see below + // for discussion). + + // We log to the blueprint separately from the log to the lease: + // the lease is not attached to a blueprint yet so the lease log + // will not show up on the blueprint; more than one blueprint may + // fail; and the lease is not really impacted (and won't log) if at + // least one blueprint actually works. + + $blueprint->logEvent( + DrydockResourceAllocationFailureLogType::LOGCONST, array( - 'class' => get_class($ex), - 'message' => $ex->getMessage(), + 'class' => get_class($caught), + 'message' => $caught->getMessage(), )); - throw new PhabricatorWorkerYieldException(15); + $exceptions[] = $caught; } + } - $resources = $this->removeUnacquirableResources($resources, $lease); - if (!$resources) { - // If we make it here, we just built a resource but aren't allowed - // to acquire it. We expect this during routine operation if the - // resource prevents acquisition until it activates. Yield and wait - // for activation. - throw new PhabricatorWorkerYieldException(15); - } + if (!$resources) { + // If one or more blueprints claimed that they would be able to allocate + // resources but none are actually able to allocate resources, log the + // failure and yield so we try again soon. + + // This can happen if some unexpected issue occurs during allocation + // (for example, a call to build a VM fails for some reason) or if we + // raced another allocator and the blueprint is now full. + + $ex = new PhutilAggregateException( + pht( + 'All blueprints failed to allocate a suitable new resource when '. + 'trying to allocate lease ("%s").', + $lease->getPHID()), + $exceptions); - // NOTE: We have not acquired the lease yet, so it is possible that the - // resource we just built will be snatched up by some other lease before - // we can acquire it. This is not problematic: we'll retry a little later - // and should succeed eventually. + $lease->logEvent( + DrydockLeaseAllocationFailureLogType::LOGCONST, + array( + 'class' => get_class($ex), + 'message' => $ex->getMessage(), + )); + + return null; + } + + return $resources; + } + + + private function leaseAnyResource( + DrydockLease $lease, + array $resources) { + assert_instances_of($resources, 'DrydockResource'); + + if (!$resources) { + return null; } $resources = $this->rankResources($resources, $lease); $exceptions = array(); $yields = array(); - $allocated = false; + + $allocated = null; foreach ($resources as $resource) { try { - $resource = $this->newResourceForAcquisition($resource, $lease); $this->acquireLease($resource, $lease); - $allocated = true; + $allocated = $resource; break; } catch (DrydockResourceLockException $ex) { // We need to lock the resource to actually acquire it. If we aren't // able to acquire the lock quickly enough, we can yield and try again // later. $yields[] = $ex; + } catch (DrydockSlotLockException $ex) { + // This also just indicates we ran into some kind of contention, + // probably from another lease. Just yield. + $yields[] = $ex; } catch (DrydockAcquiredBrokenResourceException $ex) { // If a resource was reclaimed or destroyed by the time we actually - // got around to acquiring it, we just got unlucky. We can yield and - // try again later. + // got around to acquiring it, we just got unlucky. $yields[] = $ex; } catch (PhabricatorWorkerYieldException $ex) { // We can be told to yield, particularly by the supplemental allocator @@ -329,17 +552,19 @@ } } - if (!$allocated) { - if ($yields) { - throw new PhabricatorWorkerYieldException(15); - } else { - throw new PhutilAggregateException( - pht( - 'Unable to acquire lease "%s" on any resource.', - $lease->getPHID()), - $exceptions); - } + if ($allocated) { + return $allocated; + } + + if ($yields) { + throw new PhabricatorWorkerYieldException(15); } + + throw new PhutilAggregateException( + pht( + 'Unable to acquire lease "%s" on any resource.', + $lease->getPHID()), + $exceptions); } @@ -426,7 +651,7 @@ * the lease. * @task allocator */ - private function loadResourcesForAllocatingLease( + private function loadAcquirableResourcesForLease( array $blueprints, DrydockLease $lease) { assert_instances_of($blueprints, 'DrydockBlueprint'); @@ -438,7 +663,6 @@ ->withTypes(array($lease->getResourceType())) ->withStatuses( array( - DrydockResourceStatus::STATUS_PENDING, DrydockResourceStatus::STATUS_ACTIVE, )) ->execute(); @@ -558,6 +782,13 @@ // If this resource was allocated as a pending resource, queue a task to // activate it. if ($resource->getStatus() == DrydockResourceStatus::STATUS_PENDING) { + + $lease->addAllocatedResourcePHIDs( + array( + $resource->getPHID(), + )); + $lease->save(); + PhabricatorWorker::scheduleTask( 'DrydockResourceUpdateWorker', array( @@ -636,26 +867,6 @@ DrydockLease $lease) { $viewer = $this->getViewer(); - // If this lease is marked as already in the process of reclaiming a - // resource, don't let it reclaim another one until the first reclaim - // completes. This stops one lease from reclaiming a large number of - // resources if the reclaims take a while to complete. - $reclaiming_phid = $lease->getAttribute('drydock.reclaimingPHID'); - if ($reclaiming_phid) { - $reclaiming_resource = id(new DrydockResourceQuery()) - ->setViewer($viewer) - ->withPHIDs(array($reclaiming_phid)) - ->withStatuses( - array( - DrydockResourceStatus::STATUS_ACTIVE, - DrydockResourceStatus::STATUS_RELEASED, - )) - ->executeOne(); - if ($reclaiming_resource) { - return null; - } - } - $resources = id(new DrydockResourceQuery()) ->setViewer($viewer) ->withBlueprintPHIDs(array($blueprint->getPHID())) @@ -754,73 +965,6 @@ } } - private function newResourceForAcquisition( - DrydockResource $resource, - DrydockLease $lease) { - - // If the resource has no leases against it, never build a new one. This is - // likely already a new resource that just activated. - $viewer = $this->getViewer(); - - $statuses = array( - DrydockLeaseStatus::STATUS_PENDING, - DrydockLeaseStatus::STATUS_ACQUIRED, - DrydockLeaseStatus::STATUS_ACTIVE, - ); - - $leases = id(new DrydockLeaseQuery()) - ->setViewer($viewer) - ->withResourcePHIDs(array($resource->getPHID())) - ->withStatuses($statuses) - ->setLimit(1) - ->execute(); - if (!$leases) { - return $resource; - } - - // If we're about to get a lease on a resource, check if the blueprint - // wants to allocate a supplemental resource. If it does, try to perform a - // new allocation instead. - $blueprint = $resource->getBlueprint(); - if (!$blueprint->shouldAllocateSupplementalResource($resource, $lease)) { - return $resource; - } - - // If the blueprint is already overallocated, we can't allocate a new - // resource. Just return the existing resource. - $remaining = $this->removeOverallocatedBlueprints( - array($blueprint), - $lease); - if (!$remaining) { - return $resource; - } - - // Try to build a new resource. - try { - $new_resource = $this->allocateResource($blueprint, $lease); - } catch (Exception $ex) { - $blueprint->logEvent( - DrydockResourceAllocationFailureLogType::LOGCONST, - array( - 'class' => get_class($ex), - 'message' => $ex->getMessage(), - )); - - return $resource; - } - - // If we can't actually acquire the new resource yet, just yield. - // (We could try to move forward with the original resource instead.) - $acquirable = $this->removeUnacquirableResources( - array($new_resource), - $lease); - if (!$acquirable) { - throw new PhabricatorWorkerYieldException(15); - } - - return $new_resource; - } - /* -( Activating Leases )-------------------------------------------------- */ diff --git a/src/applications/drydock/worker/DrydockWorker.php b/src/applications/drydock/worker/DrydockWorker.php --- a/src/applications/drydock/worker/DrydockWorker.php +++ b/src/applications/drydock/worker/DrydockWorker.php @@ -260,7 +260,7 @@ // Mark the lease as reclaiming this resource. It won't be allowed to start // another reclaim as long as this resource is still in the process of // being reclaimed. - $lease->setAttribute('drydock.reclaimingPHID', $resource->getPHID()); + $lease->addReclaimedResourcePHIDs(array($resource->getPHID())); // When the resource releases, we we want to reawaken this task since it // should (usually) be able to start building a new resource right away.