Changeset View
Changeset View
Standalone View
Standalone View
src/applications/search/ferret/PhabricatorFerretEngine.php
| Show All 10 Lines | abstract class PhabricatorFerretEngine extends Phobject { | ||||
| } | } | ||||
| public function tokenizeString($value) { | public function tokenizeString($value) { | ||||
| $value = trim($value, ' '); | $value = trim($value, ' '); | ||||
| $value = preg_split('/ +/', $value); | $value = preg_split('/ +/', $value); | ||||
| return $value; | return $value; | ||||
| } | } | ||||
| public function getNgramsFromString($value, $mode) { | public function getTermNgramsFromString($string) { | ||||
| return $this->getNgramsFromString($string, true); | |||||
| } | |||||
| public function getSubstringNgramsFromString($string) { | |||||
| return $this->getNgramsFromString($string, false); | |||||
| } | |||||
| private function getNgramsFromString($value, $as_term) { | |||||
| $tokens = $this->tokenizeString($value); | $tokens = $this->tokenizeString($value); | ||||
| $ngrams = array(); | $ngrams = array(); | ||||
| foreach ($tokens as $token) { | foreach ($tokens as $token) { | ||||
| $token = phutil_utf8_strtolower($token); | $token = phutil_utf8_strtolower($token); | ||||
| switch ($mode) { | if ($as_term) { | ||||
| case 'query': | |||||
| break; | |||||
| case 'index': | |||||
| $token = ' '.$token.' '; | $token = ' '.$token.' '; | ||||
| break; | |||||
| case 'prefix': | |||||
| $token = ' '.$token; | |||||
| break; | |||||
| } | } | ||||
| $token_v = phutil_utf8v($token); | $token_v = phutil_utf8v($token); | ||||
| $len = (count($token_v) - 2); | $len = (count($token_v) - 2); | ||||
| for ($ii = 0; $ii < $len; $ii++) { | for ($ii = 0; $ii < $len; $ii++) { | ||||
| $ngram = array_slice($token_v, $ii, 3); | $ngram = array_slice($token_v, $ii, 3); | ||||
| $ngram = implode('', $ngram); | $ngram = implode('', $ngram); | ||||
| $ngrams[$ngram] = $ngram; | $ngrams[$ngram] = $ngram; | ||||
| ▲ Show 20 Lines • Show All 48 Lines • ▼ Show 20 Lines | public function newTermsCorpus($raw_corpus) { | ||||
| // NOTE: Single quotes divide terms only if they're at a word boundary. | // NOTE: Single quotes divide terms only if they're at a word boundary. | ||||
| // In contractions, like "whom'st've", the entire word is a single term. | // In contractions, like "whom'st've", the entire word is a single term. | ||||
| $term_corpus = preg_replace('/(^| )[\']+/', ' ', $term_corpus); | $term_corpus = preg_replace('/(^| )[\']+/', ' ', $term_corpus); | ||||
| $term_corpus = preg_replace('/[\']+( |$)/', ' ', $term_corpus); | $term_corpus = preg_replace('/[\']+( |$)/', ' ', $term_corpus); | ||||
| $term_corpus = preg_replace('/\s+/u', ' ', $term_corpus); | $term_corpus = preg_replace('/\s+/u', ' ', $term_corpus); | ||||
| $term_corpus = trim($term_corpus, ' '); | $term_corpus = trim($term_corpus, ' '); | ||||
| if (strlen($term_corpus)) { | |||||
| $term_corpus = ' '.$term_corpus.' '; | |||||
| } | |||||
| return $term_corpus; | return $term_corpus; | ||||
| } | } | ||||
| } | } | ||||