Page Menu · Home · Phabricator

D18498.diff
No One · Temporary

D18498.diff

diff --git a/resources/sql/autopatches/20170830.ferret.02.term.sql b/resources/sql/autopatches/20170830.ferret.02.term.sql
new file mode 100644
--- /dev/null
+++ b/resources/sql/autopatches/20170830.ferret.02.term.sql
@@ -0,0 +1,3 @@
+-- Add the "termCorpus" column to the maniphest_task_ffield table.
+ALTER TABLE {$NAMESPACE}_maniphest.maniphest_task_ffield
+  ADD COLUMN termCorpus LONGTEXT NOT NULL COLLATE {$COLLATE_SORT};
diff --git a/src/__phutil_library_map__.php b/src/__phutil_library_map__.php
--- a/src/__phutil_library_map__.php
+++ b/src/__phutil_library_map__.php
@@ -3206,6 +3206,7 @@
'PhabricatorNavigationRemarkupRule' => 'infrastructure/markup/rule/PhabricatorNavigationRemarkupRule.php',
'PhabricatorNeverTriggerClock' => 'infrastructure/daemon/workers/clock/PhabricatorNeverTriggerClock.php',
'PhabricatorNgramEngine' => 'applications/search/ngrams/PhabricatorNgramEngine.php',
+ 'PhabricatorNgramEngineTestCase' => 'applications/search/ngrams/__tests__/PhabricatorNgramEngineTestCase.php',
'PhabricatorNgramsIndexEngineExtension' => 'applications/search/engineextension/PhabricatorNgramsIndexEngineExtension.php',
'PhabricatorNgramsInterface' => 'applications/search/interface/PhabricatorNgramsInterface.php',
'PhabricatorNotificationBuilder' => 'applications/notification/builder/PhabricatorNotificationBuilder.php',
@@ -8587,6 +8588,7 @@
'PhabricatorNavigationRemarkupRule' => 'PhutilRemarkupRule',
'PhabricatorNeverTriggerClock' => 'PhabricatorTriggerClock',
'PhabricatorNgramEngine' => 'Phobject',
+ 'PhabricatorNgramEngineTestCase' => 'PhabricatorTestCase',
'PhabricatorNgramsIndexEngineExtension' => 'PhabricatorIndexEngineExtension',
'PhabricatorNgramsInterface' => 'PhabricatorIndexableInterface',
'PhabricatorNotificationBuilder' => 'Phobject',
diff --git a/src/applications/search/engineextension/PhabricatorFerretFulltextEngineExtension.php b/src/applications/search/engineextension/PhabricatorFerretFulltextEngineExtension.php
--- a/src/applications/search/engineextension/PhabricatorFerretFulltextEngineExtension.php
+++ b/src/applications/search/engineextension/PhabricatorFerretFulltextEngineExtension.php
@@ -30,11 +30,13 @@
->setEpochModified(0);
$stemmer = new PhutilSearchStemmer();
+ $ngram_engine = id(new PhabricatorNgramEngine());
$key_all = PhabricatorSearchDocumentFieldType::FIELD_ALL;
$empty_template = array(
'raw' => array(),
+ 'term' => array(),
'normal' => array(),
);
@@ -49,15 +51,18 @@
}
$normal_corpus = $stemmer->stemCorpus($raw_corpus);
+ $term_corpus = $ngram_engine->newTermsCorpus($raw_corpus);
if (!isset($ferret_corpus_map[$key])) {
$ferret_corpus_map[$key] = $empty_template;
}
$ferret_corpus_map[$key]['raw'][] = $raw_corpus;
+ $ferret_corpus_map[$key]['term'][] = $term_corpus;
$ferret_corpus_map[$key]['normal'][] = $normal_corpus;
$ferret_corpus_map[$key_all]['raw'][] = $raw_corpus;
+ $ferret_corpus_map[$key_all]['term'][] = $term_corpus;
$ferret_corpus_map[$key_all]['normal'][] = $normal_corpus;
}
@@ -69,17 +74,23 @@
$normal_corpus = $fields['normal'];
$normal_corpus = implode("\n", $normal_corpus);
+ $term_corpus = $fields['term'];
+ $term_corpus = implode(' ', $term_corpus);
+ if (strlen($term_corpus)) {
+ $term_corpus = ' '.$term_corpus.' ';
+ }
+
$ferret_fields[] = $engine->newFieldObject()
->setFieldKey($key)
->setRawCorpus($raw_corpus)
+ ->setTermCorpus($term_corpus)
->setNormalCorpus($normal_corpus);
}
$ngrams_source = $ferret_corpus_map[$key_all]['raw'];
$ngrams_source = implode("\n", $ngrams_source);
- $ngrams = id(new PhabricatorNgramEngine())
- ->getNgramsFromString($ngrams_source, 'index');
+ $ngrams = $ngram_engine->getNgramsFromString($ngrams_source, 'index');
$ferret_document->openTransaction();
diff --git a/src/applications/search/ferret/PhabricatorFerretField.php b/src/applications/search/ferret/PhabricatorFerretField.php
--- a/src/applications/search/ferret/PhabricatorFerretField.php
+++ b/src/applications/search/ferret/PhabricatorFerretField.php
@@ -6,6 +6,7 @@
protected $documentID;
protected $fieldKey;
protected $rawCorpus;
+ protected $termCorpus;
protected $normalCorpus;
abstract public function getIndexKey();
@@ -17,6 +18,7 @@
'documentID' => 'uint32',
'fieldKey' => 'text4',
'rawCorpus' => 'sort',
+ 'termCorpus' => 'sort',
'normalCorpus' => 'sort',
),
self::CONFIG_KEY_SCHEMA => array(
diff --git a/src/applications/search/ngrams/PhabricatorNgramEngine.php b/src/applications/search/ngrams/PhabricatorNgramEngine.php
--- a/src/applications/search/ngrams/PhabricatorNgramEngine.php
+++ b/src/applications/search/ngrams/PhabricatorNgramEngine.php
@@ -40,4 +40,74 @@
     return array_keys($ngrams);
   }

+  /**
+   * Convert a raw corpus into a corpus of space-separated terms.
+   *
+   * ASCII punctuation (except single quotes), newlines, carriage returns and
+   * tabs are replaced with spaces. Single quotes are then stripped where they
+   * touch a word boundary, runs of whitespace are collapsed to one space, and
+   * leading and trailing spaces are trimmed.
+   *
+   * @param string Raw corpus text.
+   * @return string Terms separated by single spaces.
+   */
+  public function newTermsCorpus($raw_corpus) {
+    $term_corpus = strtr(
+      $raw_corpus,
+      array(
+        '!' => ' ',
+        '"' => ' ',
+        '#' => ' ',
+        '$' => ' ',
+        '%' => ' ',
+        '&' => ' ',
+        '(' => ' ',
+        ')' => ' ',
+        '*' => ' ',
+        '+' => ' ',
+        ',' => ' ',
+        '-' => ' ',
+        '/' => ' ',
+        ':' => ' ',
+        ';' => ' ',
+        '<' => ' ',
+        '=' => ' ',
+        '>' => ' ',
+        '?' => ' ',
+        '@' => ' ',
+        '[' => ' ',
+        '\\' => ' ',
+        ']' => ' ',
+        '^' => ' ',
+        '`' => ' ',
+        '{' => ' ',
+        '|' => ' ',
+        '}' => ' ',
+        '~' => ' ',
+        '.' => ' ',
+        '_' => ' ',
+        "\n" => ' ',
+        "\r" => ' ',
+        "\t" => ' ',
+      ));
+
+    // NOTE: Single quotes divide terms only if they're at a word boundary.
+    // In contractions, like "whom'st've", the entire word is a single term.
+    $term_corpus = preg_replace('/(^| )[\']+/', ' ', $term_corpus);
+    $term_corpus = preg_replace('/[\']+( |$)/', ' ', $term_corpus);
+
+    // With the "u" modifier, preg_replace() returns null when the subject is
+    // not valid UTF-8. Retry byte-wise in that case so a single malformed
+    // byte does not silently discard the entire corpus.
+    $collapsed = preg_replace('/\s+/u', ' ', $term_corpus);
+    if ($collapsed === null) {
+      $collapsed = preg_replace('/\s+/', ' ', $term_corpus);
+    }
+    if ($collapsed !== null) {
+      $term_corpus = $collapsed;
+    }
+
+    return trim($term_corpus, ' ');
+  }
+
 }
diff --git a/src/applications/search/ngrams/__tests__/PhabricatorNgramEngineTestCase.php b/src/applications/search/ngrams/__tests__/PhabricatorNgramEngineTestCase.php
new file mode 100644
--- /dev/null
+++ b/src/applications/search/ngrams/__tests__/PhabricatorNgramEngineTestCase.php
@@ -0,0 +1,42 @@
+<?php
+
+/**
+ * Unit tests for PhabricatorNgramEngine.
+ */
+final class PhabricatorNgramEngineTestCase
+  extends PhabricatorTestCase {
+
+  /**
+   * Verify that newTermsCorpus() reduces raw text to the expected
+   * space-separated terms.
+   */
+  public function testTermsCorpus() {
+    $map = array(
+      'Hear ye, hear ye!' => 'Hear ye hear ye',
+      "Thou whom'st've art worthy." => "Thou whom'st've art worthy",
+      'Guaranteed to contain "food".' => 'Guaranteed to contain food',
+      'http://example.org/path/to/file.jpg' =>
+        'http example org path to file jpg',
+
+      // Quotes at word boundaries are separators, not part of the term.
+      "'quoted' words" => 'quoted words',
+
+      // Tabs, newlines and runs of spaces collapse to single spaces.
+      "tabs\tand\nnew  lines" => 'tabs and new lines',
+
+      // Empty input produces an empty corpus.
+      '' => '',
+    );
+
+    $engine = new PhabricatorNgramEngine();
+    foreach ($map as $input => $expect) {
+      $actual = $engine->newTermsCorpus($input);
+
+      $this->assertEqual(
+        $expect,
+        $actual,
+        pht('Terms corpus for: %s', $input));
+    }
+  }
+
+}

File Metadata

Mime Type
text/plain
Expires
Oct 15 2024, 9:48 PM (4 w, 4 d ago)
Storage Engine
blob
Storage Format
Encrypted (AES-256-CBC)
Storage Handle
6714571
Default Alt Text
D18498.diff (7 KB)

Event Timeline