Differential D10907 Diff 26201 src/applications/search/management/PhabricatorSearchManagementFindDuplicatesWorkflow.php
Changeset View
Changeset View
Standalone View
Standalone View
src/applications/search/management/PhabricatorSearchManagementFindDuplicatesWorkflow.php
- This file was added.
<?php | |||||
/**
 * Search management workflow with two modes:
 *
 *  - when the `installmapping` argument is given, it PUTs the contents of
 *    `resources/elasticsearch/TASK.mapping` to the `_mapping/TASK` endpoint
 *    of the configured Elasticsearch host and namespace;
 *  - otherwise, for every task closed as a duplicate, it searches
 *    Elasticsearch for the duplicate's title and reports whether the task
 *    it was merged into shows up among the hits, along with score
 *    statistics.
 */
final class PhabricatorSearchManagementFindDuplicatesWorkflow
  extends PhabricatorSearchManagementWorkflow {

  /**
   * Declares the workflow name, help text and the two arguments it accepts.
   */
  protected function didConstruct() {
    $this
      ->setName('find_duplicates')
      ->setSynopsis(
        'Try to analyze how effective duplicate task search would be.')
      ->setExamples(
        "**find_duplicates**\n")
      ->setArguments(
        array(
          array(
            'name' => 'installmapping',
            'help' => 'Adds a mapping for the additional corpus_analyzed field',
          ),
          array(
            'name' => 'analyzed',
            'help' => 'Use the analyzed corpus field in elasticsearch',
          ),
        ));
  }

  /**
   * Dispatches to mapping installation or to the duplicate analysis,
   * depending on whether the `installmapping` argument was given.
   *
   * @param PhutilArgumentParser $args Parsed command line arguments.
   */
  public function execute(PhutilArgumentParser $args) {
    if ($args->getArg('installmapping')) {
      $this->installMapping();
      return;
    }

    $this->analyzeDuplicates($args->getArg('analyzed'));
  }

  /**
   * Uploads the TASK mapping file to Elasticsearch and dumps the decoded
   * response.
   *
   * @throws Exception If the mapping file cannot be read or the server
   *                   response does not decode to an array.
   */
  private function installMapping() {
    $uri = new PhutilURI(PhabricatorEnv::getEnvConfig('search.elastic.host'));
    $uri->setPath(PhabricatorEnv::getEnvConfig('search.elastic.namespace'));
    $uri->appendPath('/_mapping/TASK');

    $mapping_path = dirname(phutil_get_library_root('phabricator')).
      '/resources/elasticsearch/TASK.mapping';

    // file_get_contents() returns false on failure; never send that as the
    // request body.
    $data = file_get_contents($mapping_path);
    if ($data === false) {
      throw new Exception(
        sprintf('Unable to read mapping file "%s".', $mapping_path));
    }

    $future = new HTTPSFuture($uri, $data);
    $future->setMethod('PUT');
    list($body) = $future->resolvex();

    $body = json_decode($body, true);
    if (!is_array($body)) {
      throw new Exception('elasticsearch server returned invalid JSON!');
    }

    var_dump($body);
  }

  /**
   * For each task closed as a duplicate, searches for its title and records
   * whether the task it was merged into is among the returned hits, then
   * prints one line per duplicate followed by overall statistics.
   *
   * Note that the "Avg hit score" summary is averaged over all duplicates;
   * duplicates whose original was not found contribute a hit score of 0.
   *
   * @param bool $use_analyzed When truthy, the query is pointed at the
   *                           `field.corpus.analyzed` field.
   * @throws PhutilArgumentUsageException If the selected search engine is
   *                                      not the Elasticsearch engine.
   */
  private function analyzeDuplicates($use_analyzed = false) {
    $search_engine = id(new PhabricatorDefaultSearchEngineSelector())
      ->newEngine();
    if (!($search_engine instanceof PhabricatorSearchEngineElastic)) {
      throw new PhutilArgumentUsageException(
        "This script currently only works if elasticsearch is enabled as a search backend\n".
        "Please make sure that all your tasks have been indexed!");
    }

    $transaction_table = new ManiphestTransaction();
    $task_table = new ManiphestTask();
    $conn = $task_table->establishConnection('r');

    // Pair every duplicate-closed task with the task named in its
    // "merged into" transaction. The TRIM strips surrounding double quotes
    // from tx.newValue before it is compared with a task PHID (presumably
    // because the value is stored JSON-encoded -- confirm against the
    // transaction storage format).
    $rows = queryfx_all(
      $conn,
      'SELECT t.phid AS dupPHID, t.title AS dupTitle, t.description AS dupDescription, t2.phid AS origPHID, t2.title AS origTitle, t2.description AS origDescription FROM %T t, %T tx, %T t2
        WHERE t.phid = tx.objectPHID
        AND t.status = %s
        AND tx.transactionType = %s
        AND TRIM(BOTH \'"\' FROM tx.newValue) = t2.phid
        ',
      $task_table->getTableName(),
      $transaction_table->getTableName(),
      $task_table->getTableName(),
      ManiphestTaskStatus::STATUS_CLOSED_DUPLICATE,
      ManiphestTransaction::TYPE_MERGED_INTO);

    $dups = array();
    foreach ($rows as $row) {
      $dup_phid = $row['dupPHID'];

      $query = id(new PhabricatorSavedQuery())
        ->setParameter('types', array('TASK'))
        ->setParameter('limit', 10)
        ->setParameter('exclude', $dup_phid)
        ->setParameter('operator', 'or')
        ->setParameter('query', $row['dupTitle']);
      if ($use_analyzed) {
        $query->setParameter('query_field', 'field.corpus.analyzed');
      }

      // The search is executed for its side effect only: the statistics are
      // taken from the raw response the engine keeps afterwards.
      $search_engine->executeSearch($query);
      $response = $search_engine->getLastResponse();

      // Tolerate a response that carries no hit list.
      $hits = array();
      if (isset($response['hits']['hits']) &&
          is_array($response['hits']['hits'])) {
        $hits = $response['hits']['hits'];
      }

      // All per-task state is rebuilt from scratch here, so a PHID that
      // occurs in more than one row never inherits scores from an earlier
      // row.
      $found = false;
      $hit_score = 0;
      $scores = array();
      foreach ($hits as $hit) {
        if ($hit['_id'] === $row['origPHID']) {
          $found = true;
          $hit_score = $hit['_score'];
        }
        $scores[] = $hit['_score'];
      }

      $dups[$dup_phid] = array(
        'found' => $found,
        'hits' => count($scores),
        'avgScore' => $this->getAverageScore($scores),
        'hitScore' => $hit_score,
        'scores' => $scores,
      );
    }

    foreach ($dups as $phid => $stats) {
      printf(
        "%s Found: %d Results: %06d Hit score: %1.4f Avg Score: %1.4f \n",
        $phid,
        $stats['found'],
        $stats['hits'],
        $stats['hitScore'],
        $stats['avgScore']);
    }

    $found_count = array_sum(ipull($dups, 'found'));
    $avg_hit_score = $this->getAverageScore(ipull($dups, 'hitScore'));
    $avg_overall_score = $this->getAverageScore(ipull($dups, 'avgScore'));

    printf(
      "\nStats\nFound: %d of %d Avg hit score: %1.4f Avg overall score: %1.4f\n",
      $found_count,
      count($dups),
      $avg_hit_score,
      $avg_overall_score);
  }

  /**
   * Returns the arithmetic mean of a list of numbers, or 0 for an empty
   * list so callers never divide by zero.
   *
   * @param array $values Numeric values to average.
   * @return int|float Mean of the values, or 0 when there are none.
   */
  private function getAverageScore(array $values) {
    if (!$values) {
      return 0;
    }

    return array_sum($values) / count($values);
  }

}