diff --git a/src/__phutil_library_map__.php b/src/__phutil_library_map__.php
--- a/src/__phutil_library_map__.php
+++ b/src/__phutil_library_map__.php
@@ -2288,6 +2288,7 @@
     'PhabricatorSearchField' => 'applications/search/constants/PhabricatorSearchField.php',
     'PhabricatorSearchHovercardController' => 'applications/search/controller/PhabricatorSearchHovercardController.php',
     'PhabricatorSearchIndexer' => 'applications/search/index/PhabricatorSearchIndexer.php',
+    'PhabricatorSearchManagementFindDuplicatesWorkflow' => 'applications/search/management/PhabricatorSearchManagementFindDuplicatesWorkflow.php',
     'PhabricatorSearchManagementIndexWorkflow' => 'applications/search/management/PhabricatorSearchManagementIndexWorkflow.php',
     'PhabricatorSearchManagementWorkflow' => 'applications/search/management/PhabricatorSearchManagementWorkflow.php',
     'PhabricatorSearchOrderController' => 'applications/search/controller/PhabricatorSearchOrderController.php',
@@ -5445,6 +5446,7 @@
     'PhabricatorSearchEngineElastic' => 'PhabricatorSearchEngine',
     'PhabricatorSearchEngineMySQL' => 'PhabricatorSearchEngine',
     'PhabricatorSearchHovercardController' => 'PhabricatorSearchBaseController',
+    'PhabricatorSearchManagementFindDuplicatesWorkflow' => 'PhabricatorSearchManagementWorkflow',
     'PhabricatorSearchManagementIndexWorkflow' => 'PhabricatorSearchManagementWorkflow',
     'PhabricatorSearchManagementWorkflow' => 'PhabricatorManagementWorkflow',
     'PhabricatorSearchOrderController' => 'PhabricatorSearchBaseController',
diff --git a/src/applications/search/engine/PhabricatorSearchEngineElastic.php b/src/applications/search/engine/PhabricatorSearchEngineElastic.php
--- a/src/applications/search/engine/PhabricatorSearchEngineElastic.php
+++ b/src/applications/search/engine/PhabricatorSearchEngineElastic.php
@@ -4,6 +4,7 @@
   private $uri;
   private $index;
   private $timeout;
+  private $lastResponse;
 
   public function __construct($uri, $index) {
     $this->uri = $uri;
@@ -19,6 +20,18 @@
     return $this->timeout;
   }
 
+  /**
+   * Get the raw response of the most recent @{method:executeSearch} call.
+   *
+   * Callers can use this to read per-hit data (such as `_score`) which
+   * executeSearch() discards when it reduces the response to a PHID list.
+   *
+   * @return array|null Response array, or null before the first search.
+   */
+  public function getLastResponse() {
+    return $this->lastResponse;
+  }
+
   public function reindexAbstractDocument(
     PhabricatorSearchAbstractDocument $doc) {
 
@@ -98,10 +111,14 @@
     $filter = array();
 
     if (strlen($query->getParameter('query'))) {
+      // Allow callers to target a different document field and to relax
+      // the match operator; the defaults preserve the previous behavior.
+      $query_field = $query->getParameter('query_field', 'field.corpus');
+
       $spec[] = array(
         'match' => array(
-          'field.corpus' => array(
-            'operator' => 'and',
+          $query_field => array(
+            'operator' => $query->getParameter('operator', 'and'),
             'query' => $query->getParameter('query'),
           ),
         ),
@@ -234,6 +251,8 @@
       $response = $this->executeRequest($uri, $this->buildSpec($query));
     }
 
+    // Keep the full response so callers can inspect scores afterwards.
+    $this->lastResponse = $response;
     $phids = ipull($response['hits']['hits'], '_id');
     return $phids;
   }
diff --git a/src/applications/search/management/PhabricatorSearchManagementFindDuplicatesWorkflow.php b/src/applications/search/management/PhabricatorSearchManagementFindDuplicatesWorkflow.php
new file mode 100644
--- /dev/null
+++ b/src/applications/search/management/PhabricatorSearchManagementFindDuplicatesWorkflow.php
@@ -0,0 +1,154 @@
+<?php
+
+/**
+ * Reports how often a title search would have surfaced the original of a
+ * task that was later closed as a duplicate.
+ *
+ * NOTE(review): the class header and the signature of didConstruct() were
+ * garbled in the patch under review. They are reconstructed here from the
+ * library map entries added by the same patch; confirm before landing.
+ */
+final class PhabricatorSearchManagementFindDuplicatesWorkflow
+  extends PhabricatorSearchManagementWorkflow {
+
+  protected function didConstruct() {
+    $this
+      ->setName('find_duplicates')
+      ->setSynopsis(
+        'Analyze how effective duplicate task search would be.')
+      ->setExamples(
+        "**find_duplicates**\n");
+  }
+
+  public function execute(PhutilArgumentParser $args) {
+    $this->analyzeDuplicates();
+  }
+
+  /**
+   * Search for the title of every task closed as a duplicate and print
+   * whether a task it was merged into appears among the first 10 results.
+   *
+   * @return void
+   */
+  private function analyzeDuplicates() {
+    $search_engine = id(new PhabricatorDefaultSearchEngineSelector())
+      ->newEngine();
+
+    if (!($search_engine instanceof PhabricatorSearchEngineElastic)) {
+      throw new PhutilArgumentUsageException(
+        "This script currently only works if elasticsearch is enabled as ".
+        "a search backend\n".
+        "Please make sure that all your tasks have been indexed!");
+    }
+
+    $rows = $this->loadMergedTasks();
+    if (!$rows) {
+      printf("No tasks closed as duplicate found\n");
+      return;
+    }
+
+    // Group rows by duplicate. A task with more than one merge transaction
+    // yields several rows; it is searched once and counts as found when any
+    // of its merge targets is hit, so statistics never mix across rows.
+    $titles = array();
+    $originals = array();
+    foreach ($rows as $row) {
+      $dup_phid = $row['dupPHID'];
+      $titles[$dup_phid] = $row['dupTitle'];
+      $originals[$dup_phid][$row['origPHID']] = true;
+    }
+
+    $dups = array();
+    foreach ($titles as $dup_phid => $title) {
+      $query = id(new PhabricatorSavedQuery())
+        ->setParameter('types', array('TASK'))
+        ->setParameter('limit', 10)
+        ->setParameter('exclude', $dup_phid)
+        ->setParameter('operator', 'or')
+        ->setParameter('query', $title);
+
+      // executeSearch() only returns PHIDs; the scores live in the raw
+      // response, which the engine keeps for getLastResponse().
+      $search_engine->executeSearch($query);
+      $response = $search_engine->getLastResponse();
+      $hits = $response['hits']['hits'];
+
+      $found = false;
+      $hit_score = 0;
+      $scores = array();
+      foreach ($hits as $hit) {
+        $scores[] = $hit['_score'];
+        if (!$found && isset($originals[$dup_phid][$hit['_id']])) {
+          $found = true;
+          $hit_score = $hit['_score'];
+        }
+      }
+
+      $dups[$dup_phid] = array(
+        'found' => $found,
+        'hits' => count($hits),
+        'hitScore' => $hit_score,
+        'avgScore' => $scores ? array_sum($scores) / count($scores) : 0,
+      );
+    }
+
+    $hit_scores = array();
+    foreach ($dups as $phid => $stats) {
+      printf(
+        "%s Found: %d Results: %06d Hit score: %1.4f Avg Score: %1.4f \n",
+        $phid,
+        $stats['found'],
+        $stats['hits'],
+        $stats['hitScore'],
+        $stats['avgScore']);
+
+      if ($stats['found']) {
+        $hit_scores[] = $stats['hitScore'];
+      }
+    }
+
+    // Average the hit score over the tasks whose original was found; tasks
+    // without a hit have no score and would only drag the figure to zero.
+    $found_count = count($hit_scores);
+    $avg_hit_score = $found_count
+      ? array_sum($hit_scores) / $found_count
+      : 0;
+    $avg_overall_score = array_sum(ipull($dups, 'avgScore')) / count($dups);
+
+    printf(
+      "\nStats\nFound: %d of %d Avg hit score: %1.4f ".
+      "Avg overall score: %1.4f\n",
+      $found_count,
+      count($dups),
+      $avg_hit_score,
+      $avg_overall_score);
+  }
+
+  /**
+   * Load one row per merge transaction of a task closed as a duplicate.
+   *
+   * @return array Rows with the keys dupPHID, dupTitle and origPHID.
+   */
+  private function loadMergedTasks() {
+    $task_table = new ManiphestTask();
+    $xaction_table = new ManiphestTransaction();
+    $conn = $task_table->establishConnection('r');
+
+    // The merge target is matched after stripping surrounding double
+    // quotes from newValue (presumably it is stored JSON-encoded).
+    return queryfx_all(
+      $conn,
+      'SELECT t.phid AS dupPHID, t.title AS dupTitle, t2.phid AS origPHID
+        FROM %T t
+          JOIN %T tx ON tx.objectPHID = t.phid
+          JOIN %T t2 ON t2.phid = TRIM(BOTH \'"\' FROM tx.newValue)
+        WHERE t.status = %s
+          AND tx.transactionType = %s',
+      $task_table->getTableName(),
+      $xaction_table->getTableName(),
+      $task_table->getTableName(),
+      ManiphestTaskStatus::STATUS_CLOSED_DUPLICATE,
+      ManiphestTransaction::TYPE_MERGED_INTO);
+  }
+
+}