diff --git a/src/__phutil_library_map__.php b/src/__phutil_library_map__.php --- a/src/__phutil_library_map__.php +++ b/src/__phutil_library_map__.php @@ -4694,6 +4694,7 @@ 'PhabricatorSearchManagementNgramsWorkflow' => 'applications/search/management/PhabricatorSearchManagementNgramsWorkflow.php', 'PhabricatorSearchManagementQueryWorkflow' => 'applications/search/management/PhabricatorSearchManagementQueryWorkflow.php', 'PhabricatorSearchManagementWorkflow' => 'applications/search/management/PhabricatorSearchManagementWorkflow.php', + 'PhabricatorSearchNgramEngine' => 'applications/search/engine/PhabricatorSearchNgramEngine.php', 'PhabricatorSearchNgrams' => 'applications/search/ngrams/PhabricatorSearchNgrams.php', 'PhabricatorSearchNgramsDestructionEngineExtension' => 'applications/search/engineextension/PhabricatorSearchNgramsDestructionEngineExtension.php', 'PhabricatorSearchOrderController' => 'applications/search/controller/PhabricatorSearchOrderController.php', @@ -11417,6 +11418,7 @@ 'PhabricatorSearchManagementNgramsWorkflow' => 'PhabricatorSearchManagementWorkflow', 'PhabricatorSearchManagementQueryWorkflow' => 'PhabricatorSearchManagementWorkflow', 'PhabricatorSearchManagementWorkflow' => 'PhabricatorManagementWorkflow', + 'PhabricatorSearchNgramEngine' => 'Phobject', 'PhabricatorSearchNgrams' => 'PhabricatorSearchDAO', 'PhabricatorSearchNgramsDestructionEngineExtension' => 'PhabricatorDestructionEngineExtension', 'PhabricatorSearchOrderController' => 'PhabricatorSearchBaseController', diff --git a/src/applications/search/engine/PhabricatorSearchNgramEngine.php b/src/applications/search/engine/PhabricatorSearchNgramEngine.php new file mode 100644 --- /dev/null +++ b/src/applications/search/engine/PhabricatorSearchNgramEngine.php @@ -0,0 +1,66 @@ +getNgramsFromString($string, true); + } + + public function getSubstringNgramsFromString($string) { + return $this->getNgramsFromString($string, false); + } + + private function getNgramsFromString($value, $as_term) { + $value = phutil_utf8_strtolower($value); + $tokens = $this->tokenizeNgramString($value); + + // First, extract unique tokens from the string. This reduces the number + // of `phutil_utf8v()` calls we need to make if we are indexing a large + // corpus with redundant terms. + $unique_tokens = array(); + foreach ($tokens as $token) { + if ($as_term) { + $token = ' '.$token.' '; + } + + $unique_tokens[$token] = true; + } + + $ngrams = array(); + foreach ($unique_tokens as $token => $ignored) { + $token_v = phutil_utf8v($token); + $length = count($token_v); + + // NOTE: We're being somewhat clever here to micro-optimize performance, + // especially for very long strings. See PHI87. + + $token_l = array(); + for ($ii = 0; $ii < $length; $ii++) { + $token_l[$ii] = strlen($token_v[$ii]); + } + + $ngram_count = $length - 2; + $cursor = 0; + for ($ii = 0; $ii < $ngram_count; $ii++) { + $ngram_l = $token_l[$ii] + $token_l[$ii + 1] + $token_l[$ii + 2]; + + $ngram = substr($token, $cursor, $ngram_l); + $ngrams[$ngram] = $ngram; + + $cursor += $token_l[$ii]; + } + } + + ksort($ngrams); + + return array_keys($ngrams); + } + +} diff --git a/src/applications/search/engineextension/PhabricatorFerretFulltextEngineExtension.php b/src/applications/search/engineextension/PhabricatorFerretFulltextEngineExtension.php --- a/src/applications/search/engineextension/PhabricatorFerretFulltextEngineExtension.php +++ b/src/applications/search/engineextension/PhabricatorFerretFulltextEngineExtension.php @@ -131,7 +131,8 @@ } $ngrams_source = implode("\n", $ngrams_source); - $ngrams = $engine->getTermNgramsFromString($ngrams_source); + $ngram_engine = new PhabricatorSearchNgramEngine(); + $ngrams = $ngram_engine->getTermNgramsFromString($ngrams_source); $object->openTransaction(); diff --git a/src/applications/search/ferret/PhabricatorFerretEngine.php b/src/applications/search/ferret/PhabricatorFerretEngine.php --- a/src/applications/search/ferret/PhabricatorFerretEngine.php +++ b/src/applications/search/ferret/PhabricatorFerretEngine.php @@ -62,66 +62,6 @@ return new PhutilSearchStemmer(); } - public function tokenizeString($value) { - $value = trim($value, ' '); - $value = preg_split('/\s+/u', $value); - return $value; - } - - public function getTermNgramsFromString($string) { - return $this->getNgramsFromString($string, true); - } - - public function getSubstringNgramsFromString($string) { - return $this->getNgramsFromString($string, false); - } - - private function getNgramsFromString($value, $as_term) { - $value = phutil_utf8_strtolower($value); - $tokens = $this->tokenizeString($value); - - // First, extract unique tokens from the string. This reduces the number - // of `phutil_utf8v()` calls we need to make if we are indexing a large - // corpus with redundant terms. - $unique_tokens = array(); - foreach ($tokens as $token) { - if ($as_term) { - $token = ' '.$token.' '; - } - - $unique_tokens[$token] = true; - } - - $ngrams = array(); - foreach ($unique_tokens as $token => $ignored) { - $token_v = phutil_utf8v($token); - $length = count($token_v); - - // NOTE: We're being somewhat clever here to micro-optimize performance, - // especially for very long strings. See PHI87. - - $token_l = array(); - for ($ii = 0; $ii < $length; $ii++) { - $token_l[$ii] = strlen($token_v[$ii]); - } - - $ngram_count = $length - 2; - $cursor = 0; - for ($ii = 0; $ii < $ngram_count; $ii++) { - $ngram_l = $token_l[$ii] + $token_l[$ii + 1] + $token_l[$ii + 2]; - - $ngram = substr($token, $cursor, $ngram_l); - $ngrams[$ngram] = $ngram; - - $cursor += $token_l[$ii]; - } - } - - ksort($ngrams); - - return array_keys($ngrams); - } - public function newTermsCorpus($raw_corpus) { $term_corpus = strtr( $raw_corpus, diff --git a/src/applications/search/ferret/__tests__/PhabricatorFerretEngineTestCase.php b/src/applications/search/ferret/__tests__/PhabricatorFerretEngineTestCase.php --- a/src/applications/search/ferret/__tests__/PhabricatorFerretEngineTestCase.php +++ b/src/applications/search/ferret/__tests__/PhabricatorFerretEngineTestCase.php @@ -43,10 +43,10 @@ ), ); - $engine = new ManiphestTaskFerretEngine(); + $ngram_engine = new PhabricatorSearchNgramEngine(); foreach ($map as $input => $expect) { - $actual = $engine->getTermNgramsFromString($input); + $actual = $ngram_engine->getTermNgramsFromString($input); $this->assertEqual( $actual, $expect, diff --git a/src/applications/search/ngrams/PhabricatorSearchNgrams.php b/src/applications/search/ngrams/PhabricatorSearchNgrams.php --- a/src/applications/search/ngrams/PhabricatorSearchNgrams.php +++ b/src/applications/search/ngrams/PhabricatorSearchNgrams.php @@ -7,6 +7,7 @@ protected $ngram; private $value; + private $ngramEngine; abstract public function getNgramKey(); abstract public function getColumnName(); @@ -44,41 +45,10 @@ return "{$application}_{$key}_ngrams"; } - final public function tokenizeString($value) { - $value = trim($value, ' '); - $value = preg_split('/ +/', $value); - return $value; - } - - final public function getNgramsFromString($value, $mode) { - $tokens = $this->tokenizeString($value); - - $ngrams = array(); - foreach ($tokens as $token) { - $token = phutil_utf8_strtolower($token); - - switch ($mode) { - case 'query': - break; - case 'index': - $token = ' '.$token.' '; - break; - } - - $len = (strlen($token) - 2); - for ($ii = 0; $ii < $len; $ii++) { - $ngram = substr($token, $ii, 3); - $ngrams[$ngram] = $ngram; - } - } - - ksort($ngrams); - - return array_keys($ngrams); - } - final public function writeNgram($object_id) { - $ngrams = $this->getNgramsFromString($this->getValue(), 'index'); + $ngram_engine = $this->getNgramEngine(); + $ngrams = $ngram_engine->getTermNgramsFromString($this->getValue()); + $conn_w = $this->establishConnection('w'); $sql = array(); @@ -107,4 +77,12 @@ return $this; } + private function getNgramEngine() { + if (!$this->ngramEngine) { + $this->ngramEngine = new PhabricatorSearchNgramEngine(); + } + + return $this->ngramEngine; + } + } diff --git a/src/infrastructure/query/policy/PhabricatorCursorPagedPolicyAwareQuery.php b/src/infrastructure/query/policy/PhabricatorCursorPagedPolicyAwareQuery.php --- a/src/infrastructure/query/policy/PhabricatorCursorPagedPolicyAwareQuery.php +++ b/src/infrastructure/query/policy/PhabricatorCursorPagedPolicyAwareQuery.php @@ -36,6 +36,7 @@ private $ferretTables = array(); private $ferretQuery; private $ferretMetadata = array(); + private $ngramEngine; const FULLTEXT_RANK = '_ft_rank'; const FULLTEXT_MODIFIED = '_ft_epochModified'; @@ -1984,6 +1985,7 @@ $stemmer = $engine->newStemmer(); $ngram_table = $engine->getNgramsTableName(); + $ngram_engine = $this->getNgramEngine(); $flat = array(); foreach ($this->ferretTokens as $fulltext_token) { @@ -2032,10 +2034,10 @@ } if ($is_substring) { - $ngrams = $engine->getSubstringNgramsFromString($value); + $ngrams = $ngram_engine->getSubstringNgramsFromString($value); } else { $terms_value = $engine->newTermsCorpus($value); - $ngrams = $engine->getTermNgramsFromString($terms_value); + $ngrams = $ngram_engine->getTermNgramsFromString($terms_value); // If this is a stemmed term, only look for ngrams present in both the // unstemmed and stemmed variations. @@ -2044,7 +2046,7 @@ // is (or, at least, may be) a normal word and activates. $terms_value = trim($terms_value, ' '); $stem_value = $stemmer->stemToken($terms_value); - $stem_ngrams = $engine->getTermNgramsFromString($stem_value); + $stem_ngrams = $ngram_engine->getTermNgramsFromString($stem_value); $ngrams = array_intersect($ngrams, $stem_ngrams); } } @@ -2409,6 +2411,8 @@ protected function buildNgramsJoinClause(AphrontDatabaseConnection $conn) { + $ngram_engine = $this->getNgramEngine(); + $flat = array(); foreach ($this->ngrams as $spec) { $length = $spec['length']; @@ -2420,7 +2424,7 @@ $index = $spec['index']; $value = $spec['value']; - $ngrams = $index->getNgramsFromString($value, 'query'); + $ngrams = $ngram_engine->getSubstringNgramsFromString($value); foreach ($ngrams as $ngram) { $flat[] = array( @@ -2476,6 +2480,8 @@ protected function buildNgramsWhereClause(AphrontDatabaseConnection $conn) { $where = array(); + $ngram_engine = $this->getNgramEngine(); + foreach ($this->ngrams as $ngram) { $index = $ngram['index']; $value = $ngram['value']; @@ -2488,7 +2494,8 @@ $column = qsprintf($conn, '%T', $column); } - $tokens = $index->tokenizeString($value); + $tokens = $ngram_engine->tokenizeNgramString($value); + foreach ($tokens as $token) { $where[] = qsprintf( $conn, @@ -2506,6 +2513,14 @@ return (bool)$this->ngrams; } + private function getNgramEngine() { + if (!$this->ngramEngine) { + $this->ngramEngine = new PhabricatorSearchNgramEngine(); + } + + return $this->ngramEngine; + } + /* -( Edge Logic )--------------------------------------------------------- */