Page Menu
Home
Phabricator
Search
Configure Global Search
Log In
Files
F15197245
D21128.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
11 KB
Referenced Files
None
Subscribers
None
D21128.diff
View Options
diff --git a/src/__phutil_library_map__.php b/src/__phutil_library_map__.php
--- a/src/__phutil_library_map__.php
+++ b/src/__phutil_library_map__.php
@@ -4694,6 +4694,7 @@
'PhabricatorSearchManagementNgramsWorkflow' => 'applications/search/management/PhabricatorSearchManagementNgramsWorkflow.php',
'PhabricatorSearchManagementQueryWorkflow' => 'applications/search/management/PhabricatorSearchManagementQueryWorkflow.php',
'PhabricatorSearchManagementWorkflow' => 'applications/search/management/PhabricatorSearchManagementWorkflow.php',
+ 'PhabricatorSearchNgramEngine' => 'applications/search/engine/PhabricatorSearchNgramEngine.php',
'PhabricatorSearchNgrams' => 'applications/search/ngrams/PhabricatorSearchNgrams.php',
'PhabricatorSearchNgramsDestructionEngineExtension' => 'applications/search/engineextension/PhabricatorSearchNgramsDestructionEngineExtension.php',
'PhabricatorSearchOrderController' => 'applications/search/controller/PhabricatorSearchOrderController.php',
@@ -11417,6 +11418,7 @@
'PhabricatorSearchManagementNgramsWorkflow' => 'PhabricatorSearchManagementWorkflow',
'PhabricatorSearchManagementQueryWorkflow' => 'PhabricatorSearchManagementWorkflow',
'PhabricatorSearchManagementWorkflow' => 'PhabricatorManagementWorkflow',
+ 'PhabricatorSearchNgramEngine' => 'Phobject',
'PhabricatorSearchNgrams' => 'PhabricatorSearchDAO',
'PhabricatorSearchNgramsDestructionEngineExtension' => 'PhabricatorDestructionEngineExtension',
'PhabricatorSearchOrderController' => 'PhabricatorSearchBaseController',
diff --git a/src/applications/search/engine/PhabricatorSearchNgramEngine.php b/src/applications/search/engine/PhabricatorSearchNgramEngine.php
new file mode 100644
--- /dev/null
+++ b/src/applications/search/engine/PhabricatorSearchNgramEngine.php
@@ -0,0 +1,66 @@
+<?php
+
+final class PhabricatorSearchNgramEngine
+ extends Phobject {
+
+ public function tokenizeNgramString($value) {
+ $value = trim($value, ' ');
+ $value = preg_split('/\s+/u', $value);
+ return $value;
+ }
+
+ public function getTermNgramsFromString($string) {
+ return $this->getNgramsFromString($string, true);
+ }
+
+ public function getSubstringNgramsFromString($string) {
+ return $this->getNgramsFromString($string, false);
+ }
+
+ private function getNgramsFromString($value, $as_term) {
+ $value = phutil_utf8_strtolower($value);
+ $tokens = $this->tokenizeNgramString($value);
+
+ // First, extract unique tokens from the string. This reduces the number
+ // of `phutil_utf8v()` calls we need to make if we are indexing a large
+ // corpus with redundant terms.
+ $unique_tokens = array();
+ foreach ($tokens as $token) {
+ if ($as_term) {
+ $token = ' '.$token.' ';
+ }
+
+ $unique_tokens[$token] = true;
+ }
+
+ $ngrams = array();
+ foreach ($unique_tokens as $token => $ignored) {
+ $token_v = phutil_utf8v($token);
+ $length = count($token_v);
+
+ // NOTE: We're being somewhat clever here to micro-optimize performance,
+ // especially for very long strings. See PHI87.
+
+ $token_l = array();
+ for ($ii = 0; $ii < $length; $ii++) {
+ $token_l[$ii] = strlen($token_v[$ii]);
+ }
+
+ $ngram_count = $length - 2;
+ $cursor = 0;
+ for ($ii = 0; $ii < $ngram_count; $ii++) {
+ $ngram_l = $token_l[$ii] + $token_l[$ii + 1] + $token_l[$ii + 2];
+
+ $ngram = substr($token, $cursor, $ngram_l);
+ $ngrams[$ngram] = $ngram;
+
+ $cursor += $token_l[$ii];
+ }
+ }
+
+ ksort($ngrams);
+
+ return array_keys($ngrams);
+ }
+
+}
diff --git a/src/applications/search/engineextension/PhabricatorFerretFulltextEngineExtension.php b/src/applications/search/engineextension/PhabricatorFerretFulltextEngineExtension.php
--- a/src/applications/search/engineextension/PhabricatorFerretFulltextEngineExtension.php
+++ b/src/applications/search/engineextension/PhabricatorFerretFulltextEngineExtension.php
@@ -131,7 +131,8 @@
}
$ngrams_source = implode("\n", $ngrams_source);
- $ngrams = $engine->getTermNgramsFromString($ngrams_source);
+ $ngram_engine = new PhabricatorSearchNgramEngine();
+ $ngrams = $ngram_engine->getTermNgramsFromString($ngrams_source);
$object->openTransaction();
diff --git a/src/applications/search/ferret/PhabricatorFerretEngine.php b/src/applications/search/ferret/PhabricatorFerretEngine.php
--- a/src/applications/search/ferret/PhabricatorFerretEngine.php
+++ b/src/applications/search/ferret/PhabricatorFerretEngine.php
@@ -62,66 +62,6 @@
return new PhutilSearchStemmer();
}
- public function tokenizeString($value) {
- $value = trim($value, ' ');
- $value = preg_split('/\s+/u', $value);
- return $value;
- }
-
- public function getTermNgramsFromString($string) {
- return $this->getNgramsFromString($string, true);
- }
-
- public function getSubstringNgramsFromString($string) {
- return $this->getNgramsFromString($string, false);
- }
-
- private function getNgramsFromString($value, $as_term) {
- $value = phutil_utf8_strtolower($value);
- $tokens = $this->tokenizeString($value);
-
- // First, extract unique tokens from the string. This reduces the number
- // of `phutil_utf8v()` calls we need to make if we are indexing a large
- // corpus with redundant terms.
- $unique_tokens = array();
- foreach ($tokens as $token) {
- if ($as_term) {
- $token = ' '.$token.' ';
- }
-
- $unique_tokens[$token] = true;
- }
-
- $ngrams = array();
- foreach ($unique_tokens as $token => $ignored) {
- $token_v = phutil_utf8v($token);
- $length = count($token_v);
-
- // NOTE: We're being somewhat clever here to micro-optimize performance,
- // especially for very long strings. See PHI87.
-
- $token_l = array();
- for ($ii = 0; $ii < $length; $ii++) {
- $token_l[$ii] = strlen($token_v[$ii]);
- }
-
- $ngram_count = $length - 2;
- $cursor = 0;
- for ($ii = 0; $ii < $ngram_count; $ii++) {
- $ngram_l = $token_l[$ii] + $token_l[$ii + 1] + $token_l[$ii + 2];
-
- $ngram = substr($token, $cursor, $ngram_l);
- $ngrams[$ngram] = $ngram;
-
- $cursor += $token_l[$ii];
- }
- }
-
- ksort($ngrams);
-
- return array_keys($ngrams);
- }
-
public function newTermsCorpus($raw_corpus) {
$term_corpus = strtr(
$raw_corpus,
diff --git a/src/applications/search/ferret/__tests__/PhabricatorFerretEngineTestCase.php b/src/applications/search/ferret/__tests__/PhabricatorFerretEngineTestCase.php
--- a/src/applications/search/ferret/__tests__/PhabricatorFerretEngineTestCase.php
+++ b/src/applications/search/ferret/__tests__/PhabricatorFerretEngineTestCase.php
@@ -43,10 +43,10 @@
),
);
- $engine = new ManiphestTaskFerretEngine();
+ $ngram_engine = new PhabricatorSearchNgramEngine();
foreach ($map as $input => $expect) {
- $actual = $engine->getTermNgramsFromString($input);
+ $actual = $ngram_engine->getTermNgramsFromString($input);
$this->assertEqual(
$actual,
$expect,
diff --git a/src/applications/search/ngrams/PhabricatorSearchNgrams.php b/src/applications/search/ngrams/PhabricatorSearchNgrams.php
--- a/src/applications/search/ngrams/PhabricatorSearchNgrams.php
+++ b/src/applications/search/ngrams/PhabricatorSearchNgrams.php
@@ -7,6 +7,7 @@
protected $ngram;
private $value;
+ private $ngramEngine;
abstract public function getNgramKey();
abstract public function getColumnName();
@@ -44,41 +45,10 @@
return "{$application}_{$key}_ngrams";
}
- final public function tokenizeString($value) {
- $value = trim($value, ' ');
- $value = preg_split('/ +/', $value);
- return $value;
- }
-
- final public function getNgramsFromString($value, $mode) {
- $tokens = $this->tokenizeString($value);
-
- $ngrams = array();
- foreach ($tokens as $token) {
- $token = phutil_utf8_strtolower($token);
-
- switch ($mode) {
- case 'query':
- break;
- case 'index':
- $token = ' '.$token.' ';
- break;
- }
-
- $len = (strlen($token) - 2);
- for ($ii = 0; $ii < $len; $ii++) {
- $ngram = substr($token, $ii, 3);
- $ngrams[$ngram] = $ngram;
- }
- }
-
- ksort($ngrams);
-
- return array_keys($ngrams);
- }
-
final public function writeNgram($object_id) {
- $ngrams = $this->getNgramsFromString($this->getValue(), 'index');
+ $ngram_engine = $this->getNgramEngine();
+ $ngrams = $ngram_engine->getTermNgramsFromString($this->getValue());
+
$conn_w = $this->establishConnection('w');
$sql = array();
@@ -107,4 +77,12 @@
return $this;
}
+ private function getNgramEngine() {
+ if (!$this->ngramEngine) {
+ $this->ngramEngine = new PhabricatorSearchNgramEngine();
+ }
+
+ return $this->ngramEngine;
+ }
+
}
diff --git a/src/infrastructure/query/policy/PhabricatorCursorPagedPolicyAwareQuery.php b/src/infrastructure/query/policy/PhabricatorCursorPagedPolicyAwareQuery.php
--- a/src/infrastructure/query/policy/PhabricatorCursorPagedPolicyAwareQuery.php
+++ b/src/infrastructure/query/policy/PhabricatorCursorPagedPolicyAwareQuery.php
@@ -36,6 +36,7 @@
private $ferretTables = array();
private $ferretQuery;
private $ferretMetadata = array();
+ private $ngramEngine;
const FULLTEXT_RANK = '_ft_rank';
const FULLTEXT_MODIFIED = '_ft_epochModified';
@@ -1984,6 +1985,7 @@
$stemmer = $engine->newStemmer();
$ngram_table = $engine->getNgramsTableName();
+ $ngram_engine = $this->getNgramEngine();
$flat = array();
foreach ($this->ferretTokens as $fulltext_token) {
@@ -2032,10 +2034,10 @@
}
if ($is_substring) {
- $ngrams = $engine->getSubstringNgramsFromString($value);
+ $ngrams = $ngram_engine->getSubstringNgramsFromString($value);
} else {
$terms_value = $engine->newTermsCorpus($value);
- $ngrams = $engine->getTermNgramsFromString($terms_value);
+ $ngrams = $ngram_engine->getTermNgramsFromString($terms_value);
// If this is a stemmed term, only look for ngrams present in both the
// unstemmed and stemmed variations.
@@ -2044,7 +2046,7 @@
// is (or, at least, may be) a normal word and activates.
$terms_value = trim($terms_value, ' ');
$stem_value = $stemmer->stemToken($terms_value);
- $stem_ngrams = $engine->getTermNgramsFromString($stem_value);
+ $stem_ngrams = $ngram_engine->getTermNgramsFromString($stem_value);
$ngrams = array_intersect($ngrams, $stem_ngrams);
}
}
@@ -2409,6 +2411,8 @@
protected function buildNgramsJoinClause(AphrontDatabaseConnection $conn) {
+ $ngram_engine = $this->getNgramEngine();
+
$flat = array();
foreach ($this->ngrams as $spec) {
$length = $spec['length'];
@@ -2420,7 +2424,7 @@
$index = $spec['index'];
$value = $spec['value'];
- $ngrams = $index->getNgramsFromString($value, 'query');
+ $ngrams = $ngram_engine->getSubstringNgramsFromString($value);
foreach ($ngrams as $ngram) {
$flat[] = array(
@@ -2476,6 +2480,8 @@
protected function buildNgramsWhereClause(AphrontDatabaseConnection $conn) {
$where = array();
+ $ngram_engine = $this->getNgramEngine();
+
foreach ($this->ngrams as $ngram) {
$index = $ngram['index'];
$value = $ngram['value'];
@@ -2488,7 +2494,8 @@
$column = qsprintf($conn, '%T', $column);
}
- $tokens = $index->tokenizeString($value);
+ $tokens = $ngram_engine->tokenizeNgramString($value);
+
foreach ($tokens as $token) {
$where[] = qsprintf(
$conn,
@@ -2506,6 +2513,14 @@
return (bool)$this->ngrams;
}
+ private function getNgramEngine() {
+ if (!$this->ngramEngine) {
+ $this->ngramEngine = new PhabricatorSearchNgramEngine();
+ }
+
+ return $this->ngramEngine;
+ }
+
/* -( Edge Logic )--------------------------------------------------------- */
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Sun, Feb 23, 10:18 PM (10 h, 14 m)
Storage Engine
blob
Storage Format
Encrypted (AES-256-CBC)
Storage Handle
7187868
Default Alt Text
D21128.diff (11 KB)
Attached To
Mode
D21128: Combine the two different ngram-splitting algorithms into a single engine
Attached
Detach File
Event Timeline
Log In to Comment