Differential D10907 Diff 26201 src/applications/search/management/PhabricatorSearchManagementFindDuplicatesWorkflow.php
Changeset View
Changeset View
Standalone View
Standalone View
src/applications/search/management/PhabricatorSearchManagementFindDuplicatesWorkflow.php
- This file was added.
| <?php | |||||
/**
 * Search management workflow which measures how well a full-text search on
 * a task's title would have found the task it was later merged into.
 *
 * It has two modes, chosen by command line flags:
 *
 *   - `installmapping`: PUT the bundled `TASK.mapping` file to the configured
 *     elasticsearch host.
 *   - default: for every task closed as a duplicate, search for its title and
 *     report whether the task it was merged into is among the results.
 */
final class PhabricatorSearchManagementFindDuplicatesWorkflow
  extends PhabricatorSearchManagementWorkflow {

  /**
   * Declare the workflow name, synopsis, examples and accepted arguments.
   */
  protected function didConstruct() {
    $this
      ->setName('find_duplicates')
      ->setSynopsis(
        'Analyze how effective duplicate task search would be.')
      ->setExamples(
        "**find_duplicates**\n")
      ->setArguments(
        array(
          array(
            'name' => 'installmapping',
            'help' => 'Adds a mapping for the additional corpus_analyzed field',
          ),
          array(
            'name' => 'analyzed',
            'help' => 'Use the analyzed corpus field in elasticsearch',
          ),
        ));
  }

  /**
   * Dispatch to the mapping installer or to the duplicate analysis.
   *
   * @param PhutilArgumentParser Parsed command line arguments.
   * @return void
   */
  public function execute(PhutilArgumentParser $args) {
    if ($args->getArg('installmapping')) {
      $this->installMapping();
      return;
    }

    $this->analyzeDuplicates($args->getArg('analyzed'));
  }

  /**
   * PUT the bundled TASK mapping to `<host>/<namespace>/_mapping/TASK` and
   * dump the decoded server response.
   *
   * @return void
   * @throws Exception If the mapping file can not be read, or the server
   *                   response is not a JSON object/array.
   */
  private function installMapping() {
    $uri = new PhutilURI(PhabricatorEnv::getEnvConfig('search.elastic.host'));
    $uri->setPath(PhabricatorEnv::getEnvConfig('search.elastic.namespace'));
    $uri->appendPath('/_mapping/TASK');

    $mapping_path = dirname(phutil_get_library_root('phabricator')).
      '/resources/elasticsearch/TASK.mapping';

    // file_get_contents() returns false on failure; without this check a
    // missing file would be sent to the server as the request body.
    $data = file_get_contents($mapping_path);
    if ($data === false) {
      throw new Exception(
        'Unable to read elasticsearch mapping file: '.$mapping_path);
    }

    $future = new HTTPSFuture($uri, $data);
    $future->setMethod('PUT');
    list($body) = $future->resolvex();

    $body = json_decode($body, true);
    if (!is_array($body)) {
      throw new Exception('elasticsearch server returned invalid JSON!');
    }

    var_dump($body);
  }

  /**
   * Run a title search for every task closed as a duplicate and print
   * per-task and aggregate statistics.
   *
   * @param bool If true, search the `field.corpus.analyzed` field instead of
   *             the engine's default query field.
   * @return void
   * @throws PhutilArgumentUsageException If the configured search engine is
   *                                      not the elasticsearch engine.
   */
  private function analyzeDuplicates($use_analyzed = false) {
    $search_engine = id(new PhabricatorDefaultSearchEngineSelector())
      ->newEngine();

    if (!($search_engine instanceof PhabricatorSearchEngineElastic)) {
      throw new PhutilArgumentUsageException(
        "This script currently only works if elasticsearch is enabled as a search backend\n".
        "Please make sure that all your tasks have been indexed!");
    }

    $rows = $this->loadDuplicatePairs();
    if (!$rows) {
      // Nothing to average over; bail out instead of dividing by zero.
      echo "No tasks closed as duplicates were found; nothing to analyze.\n";
      return;
    }

    $dups = array();
    foreach ($rows as $row) {
      // Each row gets a freshly built record, so a task which appears in
      // more than one row never mixes scores from different searches.
      $dups[$row['dupPHID']] = $this->scoreDuplicate(
        $search_engine,
        $row,
        $use_analyzed);
    }

    $this->printReport($dups);
  }

  /**
   * Load (duplicate, original) task pairs: tasks whose status is "closed as
   * duplicate" joined through their "merged into" transaction to the task
   * named in that transaction's new value.
   *
   * @return list<dict> Rows with keys dupPHID, dupTitle, dupDescription,
   *                    origPHID, origTitle and origDescription.
   */
  private function loadDuplicatePairs() {
    $task_table = new ManiphestTask();
    $transaction_table = new ManiphestTransaction();
    $conn = $task_table->establishConnection('r');

    // The query trims double quotes from tx.newValue before comparing it to
    // a PHID (presumably because the value is stored JSON-encoded).
    return queryfx_all(
      $conn,
      'SELECT t.phid AS dupPHID, t.title AS dupTitle, '.
        't.description AS dupDescription, t2.phid AS origPHID, '.
        't2.title AS origTitle, t2.description AS origDescription '.
        'FROM %T t, %T tx, %T t2 '.
        'WHERE t.phid = tx.objectPHID '.
        'AND t.status = %s '.
        'AND tx.transactionType = %s '.
        'AND TRIM(BOTH \'"\' FROM tx.newValue) = t2.phid',
      $task_table->getTableName(),
      $transaction_table->getTableName(),
      $task_table->getTableName(),
      ManiphestTaskStatus::STATUS_CLOSED_DUPLICATE,
      ManiphestTransaction::TYPE_MERGED_INTO);
  }

  /**
   * Search for one duplicate task's title and summarize the hits.
   *
   * @param PhabricatorSearchEngineElastic Engine to query.
   * @param dict One row as returned by loadDuplicatePairs().
   * @param bool Whether to query the `field.corpus.analyzed` field.
   * @return dict Keys: `found` (bool, the original task was among the hits),
   *              `hits` (int), `hitScore` (score of the original task, 0 if
   *              not found), `avgScore` (mean score of all hits, 0 if there
   *              were none) and `scores` (list of all hit scores).
   */
  private function scoreDuplicate(
    $search_engine,
    array $row,
    $use_analyzed) {

    $query = id(new PhabricatorSavedQuery())
      ->setParameter('types', array('TASK'))
      ->setParameter('limit', 10)
      ->setParameter('exclude', $row['dupPHID'])
      ->setParameter('operator', 'or')
      ->setParameter('query', $row['dupTitle']);

    if ($use_analyzed) {
      $query->setParameter('query_field', 'field.corpus.analyzed');
    }

    // The return value of executeSearch() is not needed here; the raw
    // response, which carries the scores, is read back via getLastResponse().
    $search_engine->executeSearch($query);
    $response = $search_engine->getLastResponse();

    // Tolerate a response without the expected hits.hits list.
    $hits = array();
    if (is_array($response) &&
        isset($response['hits']['hits']) &&
        is_array($response['hits']['hits'])) {
      $hits = $response['hits']['hits'];
    }

    $found = false;
    $hit_score = 0;
    $scores = array();
    foreach ($hits as $hit) {
      $scores[] = $hit['_score'];
      if ($hit['_id'] === $row['origPHID']) {
        $found = true;
        $hit_score = $hit['_score'];
      }
    }

    $avg_score = 0;
    if ($scores) {
      $avg_score = array_sum($scores) / count($scores);
    }

    return array(
      'found' => $found,
      'hits' => count($scores),
      'avgScore' => $avg_score,
      'hitScore' => $hit_score,
      'scores' => $scores,
    );
  }

  /**
   * Print one line per analyzed task followed by aggregate statistics.
   *
   * Averages are taken over all analyzed tasks, so tasks whose original was
   * not found contribute a hit score of 0.
   *
   * @param map<string, dict> Non-empty map of duplicate task PHID to the
   *                          record built by scoreDuplicate().
   * @return void
   */
  private function printReport(array $dups) {
    foreach ($dups as $phid => $result) {
      printf(
        "%s Found: %d Results: %06d Hit score: %1.4f Avg Score: %1.4f \n",
        $phid,
        $result['found'],
        $result['hits'],
        $result['hitScore'],
        $result['avgScore']);
    }

    $total = count($dups);
    $found = array_sum(ipull($dups, 'found'));
    $avg_hit_score = array_sum(ipull($dups, 'hitScore')) / $total;
    $avg_overall_score = array_sum(ipull($dups, 'avgScore')) / $total;

    printf(
      "\nStats\nFound: %d of %d Avg hit score: %1.4f Avg overall score: %1.4f\n",
      $found,
      $total,
      $avg_hit_score,
      $avg_overall_score);
  }

}