Page Menu | Home | Phabricator

D18649.id.diff
No One | Temporary

D18649.id.diff

diff --git a/src/applications/search/ferret/PhabricatorFerretEngine.php b/src/applications/search/ferret/PhabricatorFerretEngine.php
--- a/src/applications/search/ferret/PhabricatorFerretEngine.php
+++ b/src/applications/search/ferret/PhabricatorFerretEngine.php
@@ -106,11 +106,25 @@
$ngrams = array();
foreach ($unique_tokens as $token => $ignored) {
$token_v = phutil_utf8v($token);
- $len = (count($token_v) - 2);
- for ($ii = 0; $ii < $len; $ii++) {
- $ngram = array_slice($token_v, $ii, 3);
- $ngram = implode('', $ngram);
+ $length = count($token_v);
+
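+ // NOTE: To micro-optimize performance, especially for very long strings,
+ // we record the byte length of each character and cut every ngram straight
+ // out of the original string with substr(), rather than rebuilding it from
+ // the character list with array_slice() and implode(). See PHI87.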
+
+ $token_l = array();
+ for ($ii = 0; $ii < $length; $ii++) {
+ $token_l[$ii] = strlen($token_v[$ii]);
+ }
+
+ $ngram_count = $length - 2;
+ $cursor = 0;
+ for ($ii = 0; $ii < $ngram_count; $ii++) {
+ $ngram_l = $token_l[$ii] + $token_l[$ii + 1] + $token_l[$ii + 2];
+
+ $ngram = substr($token, $cursor, $ngram_l);
$ngrams[$ngram] = $ngram;
+
+ $cursor += $token_l[$ii];
}
}
diff --git a/src/applications/search/ferret/__tests__/PhabricatorFerretEngineTestCase.php b/src/applications/search/ferret/__tests__/PhabricatorFerretEngineTestCase.php
--- a/src/applications/search/ferret/__tests__/PhabricatorFerretEngineTestCase.php
+++ b/src/applications/search/ferret/__tests__/PhabricatorFerretEngineTestCase.php
@@ -24,4 +24,34 @@
}
}
+ /**
+ * Verify that term ngram extraction yields the expected three-character
+ * ngrams, both for single-byte input and for input containing a multibyte
+ * UTF-8 character.
+ */
+ public function testTermNgramExtraction() {
+ // U+2603 SNOWMAN, encoded as a three-byte UTF-8 sequence.
+ $snowman = "\xE2\x98\x83";
+
+ // Map of input terms to the ngrams expected for each term. The expected
+ // values show each term padded with one space on either side.
+ $map = array(
+ 'a' => array(' a '),
+ 'ab' => array(' ab', 'ab '),
+ 'abcdef' => array(' ab', 'abc', 'bcd', 'cde', 'def', 'ef '),
+ "{$snowman}" => array(" {$snowman} "),
+ "x{$snowman}y" => array(
+ " x{$snowman}",
+ "x{$snowman}y",
+ "{$snowman}y ",
+ ),
+ "{$snowman}{$snowman}" => array(
+ " {$snowman}{$snowman}",
+ "{$snowman}{$snowman} ",
+ ),
+ );
+
+ $engine = new ManiphestTaskFerretEngine();
+
+ foreach ($map as $input => $expect) {
+ $actual = $engine->getTermNgramsFromString($input);
+ // assertEqual() takes the expected value first and the actual result
+ // second, so failure output labels the two values correctly.
+ $this->assertEqual(
+ $expect,
+ $actual,
+ pht('Term ngrams for: %s.', $input));
+ }
+ }
+
}

File Metadata

Mime Type
text/plain
Expires
Mar 25 2025, 6:43 AM (4 w, 5 d ago)
Storage Engine
blob
Storage Format
Encrypted (AES-256-CBC)
Storage Handle
7382036
Default Alt Text
D18649.id.diff (2 KB)

Event Timeline