Differential D10907 Diff 26454 src/applications/search/management/PhabricatorSearchManagementFindDuplicatesWorkflow.php
Changeset View
Changeset View
Standalone View
Standalone View
src/applications/search/management/PhabricatorSearchManagementFindDuplicatesWorkflow.php
- This file was added.
<?php | |||||
/**
 * Management workflow which estimates how useful a title-based search would
 * be for spotting duplicate tasks.
 *
 * For every task closed as a duplicate, the task's title is run as an "or"
 * query against the search index (limited to 10 task results, excluding the
 * duplicate itself). The workflow then reports whether a task the duplicate
 * was merged into shows up in those results, and with which score.
 *
 * Only works when the active search engine is
 * @{class:PhabricatorSearchEngineElastic}, because the raw response of the
 * last search is inspected for per-hit scores.
 */
final class PhabricatorSearchManagementFindDuplicatesWorkflow
  extends PhabricatorSearchManagementWorkflow {

  protected function didConstruct() {
    $this
      ->setName('find_duplicates')
      ->setSynopsis(
        'Analyze how effective duplicate task search would be.')
      ->setExamples(
        "**find_duplicates**\n");
  }

  /**
   * Run the analysis and print the report.
   *
   * @param PhutilArgumentParser Parsed arguments (none are read).
   * @return int Always 0; failures are reported by exception.
   */
  public function execute(PhutilArgumentParser $args) {
    $this->analyzeDuplicates();
    return 0;
  }

  /**
   * Search for every duplicate task by title and print per-task and
   * aggregate statistics.
   *
   * @return void
   * @throws PhutilArgumentUsageException If the configured search engine is
   *   not the Elasticsearch engine.
   */
  private function analyzeDuplicates() {
    $search_engine = id(new PhabricatorDefaultSearchEngineSelector())
      ->newEngine();
    if (!($search_engine instanceof PhabricatorSearchEngineElastic)) {
      throw new PhutilArgumentUsageException(
        "This script currently only works if elasticsearch is enabled as a ".
        "search backend\n".
        "Please make sure that all your tasks have been indexed!");
    }

    $rows = $this->loadDuplicatePairs();
    if (!$rows) {
      printf("No tasks closed as duplicate found\n");
      return;
    }

    // A duplicate can show up in several rows (one per merge transaction).
    // Collect all of its merge targets so it is searched for exactly once
    // and its statistics are never mixed between rows.
    $titles = array();
    $targets = array();
    foreach ($rows as $row) {
      $dup_phid = $row['dupPHID'];
      $titles[$dup_phid] = $row['dupTitle'];
      $targets[$dup_phid][$row['origPHID']] = true;
    }

    $dups = array();
    foreach ($titles as $dup_phid => $title) {
      $dups[$dup_phid] = $this->scoreDuplicate(
        $search_engine,
        $dup_phid,
        $title,
        $targets[$dup_phid]);
    }

    $this->printReport($dups);
  }

  /**
   * Load one row per (duplicate task, merge target) pair.
   *
   * Joins tasks with the duplicate status to their "merged into"
   * transactions, and those to the task named by the transaction's new
   * value. Surrounding double quotes are stripped from the new value before
   * comparing it to a task PHID (presumably because the value is stored
   * JSON-encoded; confirm against the transaction storage format).
   *
   * @return list<map<string, string>> Rows with the keys dupPHID, dupTitle,
   *   dupDescription, origPHID, origTitle and origDescription.
   */
  private function loadDuplicatePairs() {
    $task_table = new ManiphestTask();
    $transaction_table = new ManiphestTransaction();
    $conn = $task_table->establishConnection('r');

    return queryfx_all(
      $conn,
      'SELECT t.phid AS dupPHID, t.title AS dupTitle, '.
        't.description AS dupDescription, '.
        't2.phid AS origPHID, t2.title AS origTitle, '.
        't2.description AS origDescription '.
      'FROM %T t, %T tx, %T t2 '.
      'WHERE t.phid = tx.objectPHID '.
        'AND t.status = %s '.
        'AND tx.transactionType = %s '.
        'AND TRIM(BOTH \'"\' FROM tx.newValue) = t2.phid',
      $task_table->getTableName(),
      $transaction_table->getTableName(),
      $task_table->getTableName(),
      ManiphestTaskStatus::STATUS_CLOSED_DUPLICATE,
      ManiphestTransaction::TYPE_MERGED_INTO);
  }

  /**
   * Search for a duplicate task by its title and measure the outcome.
   *
   * @param PhabricatorSearchEngineElastic Engine to run the search with.
   * @param string PHID of the duplicate task; excluded from the results.
   * @param string Title of the duplicate task, used as the query text.
   * @param map<string, bool> PHIDs of the tasks the duplicate was merged
   *   into, as array keys.
   * @return map<string, wild> Map with the keys:
   *   - found: bool, whether any merge target is among the hits;
   *   - hits: int, number of hits returned;
   *   - hitScore: score of the best-scoring merge target hit, 0 if none;
   *   - avgScore: mean score over all hits, 0 if there are no hits.
   */
  private function scoreDuplicate(
    PhabricatorSearchEngineElastic $engine,
    $dup_phid,
    $title,
    array $orig_phids) {

    $query = id(new PhabricatorSavedQuery())
      ->setParameter('types', array('TASK'))
      ->setParameter('limit', 10)
      ->setParameter('exclude', $dup_phid)
      ->setParameter('operator', 'or')
      ->setParameter('query', $title);

    // The return value is not used: scores are only available in the raw
    // response, which is read back from the engine afterwards.
    $engine->executeSearch($query);
    $response = $engine->getLastResponse();

    // Treat a missing or malformed response as "no hits" instead of
    // indexing into it blindly.
    $hits = array();
    if (isset($response['hits']['hits'])
      && is_array($response['hits']['hits'])) {
      $hits = $response['hits']['hits'];
    }

    $found = false;
    $hit_score = 0;
    $scores = array();
    foreach ($hits as $hit) {
      $score = $hit['_score'];
      $scores[] = $score;

      if (!isset($orig_phids[$hit['_id']])) {
        continue;
      }

      // With several merge targets, report the best-scoring one.
      if (!$found || $score > $hit_score) {
        $hit_score = $score;
      }
      $found = true;
    }

    $avg_score = 0;
    if ($scores) {
      $avg_score = array_sum($scores) / count($scores);
    }

    return array(
      'found' => $found,
      'hits' => count($scores),
      'hitScore' => $hit_score,
      'avgScore' => $avg_score,
    );
  }

  /**
   * Print one line per duplicate task followed by aggregate statistics.
   *
   * @param map<string, map<string, wild>> Non-empty map from duplicate task
   *   PHID to the statistics produced by @{method:scoreDuplicate}.
   * @return void
   */
  private function printReport(array $dups) {
    foreach ($dups as $phid => $stats) {
      printf(
        "%s Found: %d Results: %06d Hit score: %1.4f Avg Score: %1.4f \n",
        $phid,
        $stats['found'],
        $stats['hits'],
        $stats['hitScore'],
        $stats['avgScore']);
    }

    // The caller only reports when at least one duplicate exists, so the
    // divisions below cannot divide by zero.
    $total = count($dups);
    $found = count(array_filter(ipull($dups, 'found')));

    // Note that duplicates whose merge target was not found contribute a hit
    // score of 0 to this average.
    $avg_hit_score = array_sum(ipull($dups, 'hitScore')) / $total;
    $avg_overall_score = array_sum(ipull($dups, 'avgScore')) / $total;

    printf(
      "\nStats\nFound: %d of %d Avg hit score: %1.4f ".
      "Avg overall score: %1.4f\n",
      $found,
      $total,
      $avg_hit_score,
      $avg_overall_score);
  }

}