Page Menu
Home
Phabricator
Search
Configure Global Search
Log In
Files
F15438577
D18649.id44787.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
2 KB
Referenced Files
None
Subscribers
None
D18649.id44787.diff
View Options
diff --git a/src/applications/search/ferret/PhabricatorFerretEngine.php b/src/applications/search/ferret/PhabricatorFerretEngine.php
--- a/src/applications/search/ferret/PhabricatorFerretEngine.php
+++ b/src/applications/search/ferret/PhabricatorFerretEngine.php
@@ -106,11 +106,25 @@
$ngrams = array();
foreach ($unique_tokens as $token => $ignored) {
$token_v = phutil_utf8v($token);
- $len = (count($token_v) - 2);
- for ($ii = 0; $ii < $len; $ii++) {
- $ngram = array_slice($token_v, $ii, 3);
- $ngram = implode('', $ngram);
+ $length = count($token_v);
+
+ // NOTE: We're being somewhat clever here to micro-optimize performance,
+ // especially for very long strings. See PHI87.
+
+ $token_l = array();
+ for ($ii = 0; $ii < $length; $ii++) {
+ $token_l[$ii] = strlen($token_v[$ii]);
+ }
+
+ $ngram_count = $length - 2;
+ $cursor = 0;
+ for ($ii = 0; $ii < $ngram_count; $ii++) {
+ $ngram_l = $token_l[$ii] + $token_l[$ii + 1] + $token_l[$ii + 2];
+
+ $ngram = substr($token, $cursor, $ngram_l);
$ngrams[$ngram] = $ngram;
+
+ $cursor += $token_l[$ii];
}
}
diff --git a/src/applications/search/ferret/__tests__/PhabricatorFerretEngineTestCase.php b/src/applications/search/ferret/__tests__/PhabricatorFerretEngineTestCase.php
--- a/src/applications/search/ferret/__tests__/PhabricatorFerretEngineTestCase.php
+++ b/src/applications/search/ferret/__tests__/PhabricatorFerretEngineTestCase.php
@@ -24,4 +24,34 @@
}
}
+ /**
+ * Verify that getTermNgramsFromString() returns the expected three-character
+ * ngrams for single-byte and multibyte (UTF-8) input terms.
+ */
+ public function testTermNgramExtraction() {
+ // U+2603 SNOWMAN, which is three bytes long in UTF-8. It exercises the
+ // case where one character spans several bytes of the input string.
+ $snowman = "\xE2\x98\x83";
+
+ // Map from input term to the list of ngrams expected for it. The expected
+ // values show the term padded with one space on each side.
+ $map = array(
+ 'a' => array(' a '),
+ 'ab' => array(' ab', 'ab '),
+ 'abcdef' => array(' ab', 'abc', 'bcd', 'cde', 'def', 'ef '),
+ "{$snowman}" => array(" {$snowman} "),
+ "x{$snowman}y" => array(
+ " x{$snowman}",
+ "x{$snowman}y",
+ "{$snowman}y ",
+ ),
+ "{$snowman}{$snowman}" => array(
+ " {$snowman}{$snowman}",
+ "{$snowman}{$snowman} ",
+ ),
+ );
+
+ $engine = new ManiphestTaskFerretEngine();
+
+ foreach ($map as $input => $expect) {
+ $actual = $engine->getTermNgramsFromString($input);
+ // assertEqual() takes the expected value first and the actual result
+ // second, so that failure output labels the two values correctly.
+ $this->assertEqual(
+ $expect,
+ $actual,
+ pht('Term ngrams for: %s.', $input));
+ }
+ }
+
}
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Mar 27 2025, 1:43 AM (4 w, 3 d ago)
Storage Engine
blob
Storage Format
Encrypted (AES-256-CBC)
Storage Handle
7382036
Default Alt Text
D18649.id44787.diff (2 KB)
Attached To
Mode
D18649: Improve performance of Ferret engine ngram extraction, particularly for large input strings
Attached
Detach File
Event Timeline
Log In to Comment