Page Menu · Home · Phabricator

D21128.diff
No One · Temporary

D21128.diff

diff --git a/src/__phutil_library_map__.php b/src/__phutil_library_map__.php
--- a/src/__phutil_library_map__.php
+++ b/src/__phutil_library_map__.php
@@ -4694,6 +4694,7 @@
'PhabricatorSearchManagementNgramsWorkflow' => 'applications/search/management/PhabricatorSearchManagementNgramsWorkflow.php',
'PhabricatorSearchManagementQueryWorkflow' => 'applications/search/management/PhabricatorSearchManagementQueryWorkflow.php',
'PhabricatorSearchManagementWorkflow' => 'applications/search/management/PhabricatorSearchManagementWorkflow.php',
+ 'PhabricatorSearchNgramEngine' => 'applications/search/engine/PhabricatorSearchNgramEngine.php',
'PhabricatorSearchNgrams' => 'applications/search/ngrams/PhabricatorSearchNgrams.php',
'PhabricatorSearchNgramsDestructionEngineExtension' => 'applications/search/engineextension/PhabricatorSearchNgramsDestructionEngineExtension.php',
'PhabricatorSearchOrderController' => 'applications/search/controller/PhabricatorSearchOrderController.php',
@@ -11417,6 +11418,7 @@
'PhabricatorSearchManagementNgramsWorkflow' => 'PhabricatorSearchManagementWorkflow',
'PhabricatorSearchManagementQueryWorkflow' => 'PhabricatorSearchManagementWorkflow',
'PhabricatorSearchManagementWorkflow' => 'PhabricatorManagementWorkflow',
+ 'PhabricatorSearchNgramEngine' => 'Phobject',
'PhabricatorSearchNgrams' => 'PhabricatorSearchDAO',
'PhabricatorSearchNgramsDestructionEngineExtension' => 'PhabricatorDestructionEngineExtension',
'PhabricatorSearchOrderController' => 'PhabricatorSearchBaseController',
diff --git a/src/applications/search/engine/PhabricatorSearchNgramEngine.php b/src/applications/search/engine/PhabricatorSearchNgramEngine.php
new file mode 100644
--- /dev/null
+++ b/src/applications/search/engine/PhabricatorSearchNgramEngine.php
@@ -0,0 +1,66 @@
+<?php
+
+final class PhabricatorSearchNgramEngine
+ extends Phobject {
+ // Tokenizes strings and derives a sorted, de-duplicated list of 3-element ngrams.
+ public function tokenizeNgramString($value) {
+ $value = trim($value, ' '); // strips literal spaces only, not tabs or newlines
+ $value = preg_split('/\s+/u', $value); // split on whitespace runs; "u" = UTF-8 mode
+ return $value; // token list; can contain empty strings (e.g. for "" input)
+ }
+
+ public function getTermNgramsFromString($string) {
+ return $this->getNgramsFromString($string, true); // tokens padded with spaces
+ }
+
+ public function getSubstringNgramsFromString($string) {
+ return $this->getNgramsFromString($string, false); // no token padding
+ }
+
+ private function getNgramsFromString($value, $as_term) { // $as_term: pad each token
+ $value = phutil_utf8_strtolower($value);
+ $tokens = $this->tokenizeNgramString($value);
+
+ // First, extract unique tokens from the string. This reduces the number
+ // of `phutil_utf8v()` calls we need to make if we are indexing a large
+ // corpus with redundant terms.
+ $unique_tokens = array();
+ foreach ($tokens as $token) {
+ if ($as_term) {
+ $token = ' '.$token.' '; // one leading and one trailing space
+ }
+
+ $unique_tokens[$token] = true; // array keys de-duplicate the tokens
+ }
+
+ $ngrams = array();
+ foreach ($unique_tokens as $token => $ignored) {
+ $token_v = phutil_utf8v($token); // presumably one element per character; confirm
+ $length = count($token_v); // element count, not byte count
+
+ // NOTE: We're being somewhat clever here to micro-optimize performance,
+ // especially for very long strings. See PHI87.
+
+ $token_l = array();
+ for ($ii = 0; $ii < $length; $ii++) {
+ $token_l[$ii] = strlen($token_v[$ii]); // byte length of element $ii
+ }
+
+ $ngram_count = $length - 2; // <= 0 when the token has under 3 elements
+ $cursor = 0; // byte offset into $token where the next ngram starts
+ for ($ii = 0; $ii < $ngram_count; $ii++) {
+ $ngram_l = $token_l[$ii] + $token_l[$ii + 1] + $token_l[$ii + 2]; // bytes in 3 elements
+
+ $ngram = substr($token, $cursor, $ngram_l);
+ $ngrams[$ngram] = $ngram; // keyed by ngram, so repeats collapse
+
+ $cursor += $token_l[$ii]; // slide the window forward by one element
+ }
+ }
+
+ ksort($ngrams); // order the result by ngram key
+
+ return array_keys($ngrams); // NOTE(review): numeric-looking keys come back as ints
+ }
+
+}
diff --git a/src/applications/search/engineextension/PhabricatorFerretFulltextEngineExtension.php b/src/applications/search/engineextension/PhabricatorFerretFulltextEngineExtension.php
--- a/src/applications/search/engineextension/PhabricatorFerretFulltextEngineExtension.php
+++ b/src/applications/search/engineextension/PhabricatorFerretFulltextEngineExtension.php
@@ -131,7 +131,8 @@
}
$ngrams_source = implode("\n", $ngrams_source);
- $ngrams = $engine->getTermNgramsFromString($ngrams_source);
+ $ngram_engine = new PhabricatorSearchNgramEngine();
+ $ngrams = $ngram_engine->getTermNgramsFromString($ngrams_source);
$object->openTransaction();
diff --git a/src/applications/search/ferret/PhabricatorFerretEngine.php b/src/applications/search/ferret/PhabricatorFerretEngine.php
--- a/src/applications/search/ferret/PhabricatorFerretEngine.php
+++ b/src/applications/search/ferret/PhabricatorFerretEngine.php
@@ -62,66 +62,6 @@
return new PhutilSearchStemmer();
}
- public function tokenizeString($value) {
- $value = trim($value, ' ');
- $value = preg_split('/\s+/u', $value);
- return $value;
- }
-
- public function getTermNgramsFromString($string) {
- return $this->getNgramsFromString($string, true);
- }
-
- public function getSubstringNgramsFromString($string) {
- return $this->getNgramsFromString($string, false);
- }
-
- private function getNgramsFromString($value, $as_term) {
- $value = phutil_utf8_strtolower($value);
- $tokens = $this->tokenizeString($value);
-
- // First, extract unique tokens from the string. This reduces the number
- // of `phutil_utf8v()` calls we need to make if we are indexing a large
- // corpus with redundant terms.
- $unique_tokens = array();
- foreach ($tokens as $token) {
- if ($as_term) {
- $token = ' '.$token.' ';
- }
-
- $unique_tokens[$token] = true;
- }
-
- $ngrams = array();
- foreach ($unique_tokens as $token => $ignored) {
- $token_v = phutil_utf8v($token);
- $length = count($token_v);
-
- // NOTE: We're being somewhat clever here to micro-optimize performance,
- // especially for very long strings. See PHI87.
-
- $token_l = array();
- for ($ii = 0; $ii < $length; $ii++) {
- $token_l[$ii] = strlen($token_v[$ii]);
- }
-
- $ngram_count = $length - 2;
- $cursor = 0;
- for ($ii = 0; $ii < $ngram_count; $ii++) {
- $ngram_l = $token_l[$ii] + $token_l[$ii + 1] + $token_l[$ii + 2];
-
- $ngram = substr($token, $cursor, $ngram_l);
- $ngrams[$ngram] = $ngram;
-
- $cursor += $token_l[$ii];
- }
- }
-
- ksort($ngrams);
-
- return array_keys($ngrams);
- }
-
public function newTermsCorpus($raw_corpus) {
$term_corpus = strtr(
$raw_corpus,
diff --git a/src/applications/search/ferret/__tests__/PhabricatorFerretEngineTestCase.php b/src/applications/search/ferret/__tests__/PhabricatorFerretEngineTestCase.php
--- a/src/applications/search/ferret/__tests__/PhabricatorFerretEngineTestCase.php
+++ b/src/applications/search/ferret/__tests__/PhabricatorFerretEngineTestCase.php
@@ -43,10 +43,10 @@
),
);
- $engine = new ManiphestTaskFerretEngine();
+ $ngram_engine = new PhabricatorSearchNgramEngine();
foreach ($map as $input => $expect) {
- $actual = $engine->getTermNgramsFromString($input);
+ $actual = $ngram_engine->getTermNgramsFromString($input);
$this->assertEqual(
$actual,
$expect,
diff --git a/src/applications/search/ngrams/PhabricatorSearchNgrams.php b/src/applications/search/ngrams/PhabricatorSearchNgrams.php
--- a/src/applications/search/ngrams/PhabricatorSearchNgrams.php
+++ b/src/applications/search/ngrams/PhabricatorSearchNgrams.php
@@ -7,6 +7,7 @@
protected $ngram;
private $value;
+ private $ngramEngine;
abstract public function getNgramKey();
abstract public function getColumnName();
@@ -44,41 +45,10 @@
return "{$application}_{$key}_ngrams";
}
- final public function tokenizeString($value) {
- $value = trim($value, ' ');
- $value = preg_split('/ +/', $value);
- return $value;
- }
-
- final public function getNgramsFromString($value, $mode) {
- $tokens = $this->tokenizeString($value);
-
- $ngrams = array();
- foreach ($tokens as $token) {
- $token = phutil_utf8_strtolower($token);
-
- switch ($mode) {
- case 'query':
- break;
- case 'index':
- $token = ' '.$token.' ';
- break;
- }
-
- $len = (strlen($token) - 2);
- for ($ii = 0; $ii < $len; $ii++) {
- $ngram = substr($token, $ii, 3);
- $ngrams[$ngram] = $ngram;
- }
- }
-
- ksort($ngrams);
-
- return array_keys($ngrams);
- }
-
final public function writeNgram($object_id) {
- $ngrams = $this->getNgramsFromString($this->getValue(), 'index');
+ $ngram_engine = $this->getNgramEngine();
+ $ngrams = $ngram_engine->getTermNgramsFromString($this->getValue());
+
$conn_w = $this->establishConnection('w');
$sql = array();
@@ -107,4 +77,12 @@
return $this;
}
+ private function getNgramEngine() { // accessor for the cached ngram engine
+ if (!$this->ngramEngine) { // nothing cached on this object yet
+ $this->ngramEngine = new PhabricatorSearchNgramEngine();
+ }
+
+ return $this->ngramEngine; // later calls return this same instance
+ }
+
}
diff --git a/src/infrastructure/query/policy/PhabricatorCursorPagedPolicyAwareQuery.php b/src/infrastructure/query/policy/PhabricatorCursorPagedPolicyAwareQuery.php
--- a/src/infrastructure/query/policy/PhabricatorCursorPagedPolicyAwareQuery.php
+++ b/src/infrastructure/query/policy/PhabricatorCursorPagedPolicyAwareQuery.php
@@ -36,6 +36,7 @@
private $ferretTables = array();
private $ferretQuery;
private $ferretMetadata = array();
+ private $ngramEngine;
const FULLTEXT_RANK = '_ft_rank';
const FULLTEXT_MODIFIED = '_ft_epochModified';
@@ -1984,6 +1985,7 @@
$stemmer = $engine->newStemmer();
$ngram_table = $engine->getNgramsTableName();
+ $ngram_engine = $this->getNgramEngine();
$flat = array();
foreach ($this->ferretTokens as $fulltext_token) {
@@ -2032,10 +2034,10 @@
}
if ($is_substring) {
- $ngrams = $engine->getSubstringNgramsFromString($value);
+ $ngrams = $ngram_engine->getSubstringNgramsFromString($value);
} else {
$terms_value = $engine->newTermsCorpus($value);
- $ngrams = $engine->getTermNgramsFromString($terms_value);
+ $ngrams = $ngram_engine->getTermNgramsFromString($terms_value);
// If this is a stemmed term, only look for ngrams present in both the
// unstemmed and stemmed variations.
@@ -2044,7 +2046,7 @@
// is (or, at least, may be) a normal word and activates.
$terms_value = trim($terms_value, ' ');
$stem_value = $stemmer->stemToken($terms_value);
- $stem_ngrams = $engine->getTermNgramsFromString($stem_value);
+ $stem_ngrams = $ngram_engine->getTermNgramsFromString($stem_value);
$ngrams = array_intersect($ngrams, $stem_ngrams);
}
}
@@ -2409,6 +2411,8 @@
protected function buildNgramsJoinClause(AphrontDatabaseConnection $conn) {
+ $ngram_engine = $this->getNgramEngine();
+
$flat = array();
foreach ($this->ngrams as $spec) {
$length = $spec['length'];
@@ -2420,7 +2424,7 @@
$index = $spec['index'];
$value = $spec['value'];
- $ngrams = $index->getNgramsFromString($value, 'query');
+ $ngrams = $ngram_engine->getSubstringNgramsFromString($value);
foreach ($ngrams as $ngram) {
$flat[] = array(
@@ -2476,6 +2480,8 @@
protected function buildNgramsWhereClause(AphrontDatabaseConnection $conn) {
$where = array();
+ $ngram_engine = $this->getNgramEngine();
+
foreach ($this->ngrams as $ngram) {
$index = $ngram['index'];
$value = $ngram['value'];
@@ -2488,7 +2494,8 @@
$column = qsprintf($conn, '%T', $column);
}
- $tokens = $index->tokenizeString($value);
+ $tokens = $ngram_engine->tokenizeNgramString($value);
+
foreach ($tokens as $token) {
$where[] = qsprintf(
$conn,
@@ -2506,6 +2513,14 @@
return (bool)$this->ngrams;
}
+ private function getNgramEngine() { // accessor for the cached ngram engine
+ if (!$this->ngramEngine) { // nothing cached on this query yet
+ $this->ngramEngine = new PhabricatorSearchNgramEngine();
+ }
+
+ return $this->ngramEngine; // later calls return this same instance
+ }
+
/* -( Edge Logic )--------------------------------------------------------- */

File Metadata

Mime Type
text/plain
Expires
Thu, Jun 20, 9:30 AM (1 w, 4 d ago)
Storage Engine
blob
Storage Format
Encrypted (AES-256-CBC)
Storage Handle
6284902
Default Alt Text
D21128.diff (11 KB)

Event Timeline