Page MenuHomePhabricator

D10907.diff
No OneTemporary

D10907.diff

diff --git a/src/__phutil_library_map__.php b/src/__phutil_library_map__.php
--- a/src/__phutil_library_map__.php
+++ b/src/__phutil_library_map__.php
@@ -2288,6 +2288,7 @@
'PhabricatorSearchField' => 'applications/search/constants/PhabricatorSearchField.php',
'PhabricatorSearchHovercardController' => 'applications/search/controller/PhabricatorSearchHovercardController.php',
'PhabricatorSearchIndexer' => 'applications/search/index/PhabricatorSearchIndexer.php',
+ 'PhabricatorSearchManagementFindDuplicatesWorkflow' => 'applications/search/management/PhabricatorSearchManagementFindDuplicatesWorkflow.php',
'PhabricatorSearchManagementIndexWorkflow' => 'applications/search/management/PhabricatorSearchManagementIndexWorkflow.php',
'PhabricatorSearchManagementWorkflow' => 'applications/search/management/PhabricatorSearchManagementWorkflow.php',
'PhabricatorSearchOrderController' => 'applications/search/controller/PhabricatorSearchOrderController.php',
@@ -5445,6 +5446,7 @@
'PhabricatorSearchEngineElastic' => 'PhabricatorSearchEngine',
'PhabricatorSearchEngineMySQL' => 'PhabricatorSearchEngine',
'PhabricatorSearchHovercardController' => 'PhabricatorSearchBaseController',
+ 'PhabricatorSearchManagementFindDuplicatesWorkflow' => 'PhabricatorSearchManagementWorkflow',
'PhabricatorSearchManagementIndexWorkflow' => 'PhabricatorSearchManagementWorkflow',
'PhabricatorSearchManagementWorkflow' => 'PhabricatorManagementWorkflow',
'PhabricatorSearchOrderController' => 'PhabricatorSearchBaseController',
diff --git a/src/applications/search/engine/PhabricatorSearchEngineElastic.php b/src/applications/search/engine/PhabricatorSearchEngineElastic.php
--- a/src/applications/search/engine/PhabricatorSearchEngineElastic.php
+++ b/src/applications/search/engine/PhabricatorSearchEngineElastic.php
@@ -4,6 +4,7 @@
private $uri;
private $index;
private $timeout;
+ private $lastResponse;
public function __construct($uri, $index) {
$this->uri = $uri;
@@ -19,6 +20,10 @@
return $this->timeout;
}
+ public function getLastResponse() {
+ return $this->lastResponse;
+ }
+
public function reindexAbstractDocument(
PhabricatorSearchAbstractDocument $doc) {
@@ -41,6 +46,7 @@
foreach ($doc->getFieldData() as $field) {
$spec['field'][] = array_combine(array('type', 'corpus', 'aux'), $field);
+
}
foreach ($doc->getRelationshipData() as $relationship) {
@@ -98,10 +104,12 @@
$filter = array();
if (strlen($query->getParameter('query'))) {
+ $query_field = $query->getParameter('query_field', 'field.corpus');
+
$spec[] = array(
'match' => array(
- 'field.corpus' => array(
- 'operator' => 'and',
+ $query_field => array(
+ 'operator' => $query->getParameter('operator', 'and'),
'query' => $query->getParameter('query'),
),
),
@@ -234,6 +242,7 @@
$response = $this->executeRequest($uri, $this->buildSpec($query));
}
+ $this->lastResponse = $response;
$phids = ipull($response['hits']['hits'], '_id');
return $phids;
}
diff --git a/src/applications/search/management/PhabricatorSearchManagementFindDuplicatesWorkflow.php b/src/applications/search/management/PhabricatorSearchManagementFindDuplicatesWorkflow.php
new file mode 100644
--- /dev/null
+++ b/src/applications/search/management/PhabricatorSearchManagementFindDuplicatesWorkflow.php
@@ -0,0 +1,160 @@
+<?php
+
+/**
+ * Management workflow that measures how well a title search on a task closed
+ * as a duplicate would have surfaced the task it was merged into.
+ *
+ * Only works with PhabricatorSearchEngineElastic; the statistics are computed
+ * from the per-hit scores in the raw search response.
+ */
+final class PhabricatorSearchManagementFindDuplicatesWorkflow
+  extends PhabricatorSearchManagementWorkflow {
+
+  protected function didConstruct() {
+    $this
+      ->setName('find_duplicates')
+      ->setSynopsis(
+        'Try to analyze how effective duplicate task search would be.')
+      ->setExamples(
+        "**find_duplicates**\n");
+  }
+
+  public function execute(PhutilArgumentParser $args) {
+    $this->analyzeDuplicates();
+  }
+
+  /**
+   * Search for every duplicate task's title and print per-task and aggregate
+   * statistics about whether the merge target was among the results.
+   *
+   * @throws PhutilArgumentUsageException if the default search engine is not
+   *   PhabricatorSearchEngineElastic.
+   */
+  private function analyzeDuplicates() {
+    $search_engine = id(new PhabricatorDefaultSearchEngineSelector())
+      ->newEngine();
+
+    if (!($search_engine instanceof PhabricatorSearchEngineElastic)) {
+      throw new PhutilArgumentUsageException(
+        "This script currently only works if elasticsearch is enabled as a ".
+        "search backend\n".
+        "Please make sure that all your tasks have been indexed!");
+    }
+
+    $rows = $this->loadDuplicatePairs();
+    if (!$rows) {
+      printf("No tasks closed as duplicate found\n");
+      return;
+    }
+
+    $dups = array();
+    foreach ($rows as $row) {
+      $dups[$row['dupPHID']] = $this->scoreDuplicate($search_engine, $row);
+    }
+
+    foreach ($dups as $phid => $stats) {
+      printf(
+        "%s Found: %d Results: %06d Hit score: %1.4f Avg Score: %1.4f \n",
+        $phid,
+        $stats['found'],
+        $stats['hits'],
+        $stats['hitScore'],
+        $stats['avgScore']);
+    }
+
+    // Both averages run over every duplicate: a task whose merge target was
+    // not returned counts with a hit score of 0, and a task without any hits
+    // counts with an average score of 0.
+    $total = count($dups);
+    $found = array_sum(ipull($dups, 'found'));
+    $avg_hit_score = array_sum(ipull($dups, 'hitScore')) / $total;
+    $avg_overall_score = array_sum(ipull($dups, 'avgScore')) / $total;
+
+    printf(
+      "\nStats\nFound: %d of %d Avg hit score: %1.4f ".
+      "Avg overall score: %1.4f\n",
+      $found,
+      $total,
+      $avg_hit_score,
+      $avg_overall_score);
+  }
+
+  /**
+   * Load one row per merge transaction on a task closed as a duplicate,
+   * joined with the task named by the transaction's new value.
+   *
+   * @return list<dict> Rows with the keys dupPHID, dupTitle, dupDescription,
+   *   origPHID, origTitle and origDescription.
+   */
+  private function loadDuplicatePairs() {
+    $task_table = new ManiphestTask();
+    $transaction_table = new ManiphestTransaction();
+
+    return queryfx_all(
+      $task_table->establishConnection('r'),
+      'SELECT t.phid AS dupPHID, t.title AS dupTitle,
+          t.description AS dupDescription,
+          t2.phid AS origPHID, t2.title AS origTitle,
+          t2.description AS origDescription
+        FROM %T t, %T tx, %T t2
+        WHERE t.phid = tx.objectPHID
+          AND t.status = %s
+          AND tx.transactionType = %s
+          AND TRIM(BOTH \'"\' FROM tx.newValue) = t2.phid',
+      $task_table->getTableName(),
+      $transaction_table->getTableName(),
+      $task_table->getTableName(),
+      ManiphestTaskStatus::STATUS_CLOSED_DUPLICATE,
+      ManiphestTransaction::TYPE_MERGED_INTO);
+  }
+
+  /**
+   * Search for one duplicate's title and summarize the returned hits.
+   *
+   * @param PhabricatorSearchEngineElastic Engine to query.
+   * @param dict Row as returned by loadDuplicatePairs().
+   * @return dict Keys: found (bool), hits (int), hitScore, avgScore and
+   *   scores (list of every hit's score).
+   */
+  private function scoreDuplicate(
+    PhabricatorSearchEngineElastic $search_engine,
+    array $row) {
+
+    $query = id(new PhabricatorSavedQuery())
+      ->setParameter('types', array('TASK'))
+      ->setParameter('limit', 10)
+      ->setParameter('exclude', $row['dupPHID'])
+      ->setParameter('operator', 'or')
+      ->setParameter('query', $row['dupTitle']);
+
+    // The returned PHID list is not needed; the search is executed so that
+    // the engine records the raw response, which carries the hit scores.
+    $search_engine->executeSearch($query);
+    $response = $search_engine->getLastResponse();
+
+    $stats = array(
+      'found' => false,
+      'hits' => 0,
+      'avgScore' => 0,
+      'hitScore' => 0,
+      'scores' => array(),
+    );
+
+    foreach ($response['hits']['hits'] as $hit) {
+      $stats['hits']++;
+      $stats['scores'][] = $hit['_score'];
+
+      if ($hit['_id'] == $row['origPHID']) {
+        $stats['found'] = true;
+        $stats['hitScore'] = $hit['_score'];
+      }
+    }
+
+    if ($stats['hits']) {
+      $stats['avgScore'] = array_sum($stats['scores']) / $stats['hits'];
+    }
+
+    return $stats;
+  }
+
+}

File Metadata

Mime Type
text/plain
Expires
Sat, Jan 18, 7:08 AM (6 h, 2 m)
Storage Engine
blob
Storage Format
Encrypted (AES-256-CBC)
Storage Handle
7000620
Default Alt Text
D10907.diff (6 KB)

Event Timeline