Changeset View
Changeset View
Standalone View
Standalone View
src/applications/search/ferret/PhabricatorFerretEngine.php
Show All 10 Lines | abstract class PhabricatorFerretEngine extends Phobject { | ||||
} | } | ||||
// Split a string into tokens. Leading and trailing space characters are
// trimmed, then the string is split on runs of one or more spaces. Only the
// literal space character (' ') is treated as a separator here. Returns the
// result of preg_split(); an empty input yields a single empty-string token
// because no PREG_SPLIT_NO_EMPTY flag is passed.
public function tokenizeString($value) { | public function tokenizeString($value) { | ||||
$value = trim($value, ' '); | $value = trim($value, ' '); | ||||
$value = preg_split('/ +/', $value); | $value = preg_split('/ +/', $value); | ||||
return $value; | return $value; | ||||
} | } | ||||
public function getNgramsFromString($value, $mode) { | public function getTermNgramsFromString($string) { | ||||
return $this->getNgramsFromString($string, true); | |||||
} | |||||
public function getSubstringNgramsFromString($string) { | |||||
return $this->getNgramsFromString($string, false); | |||||
} | |||||
private function getNgramsFromString($value, $as_term) { | |||||
$tokens = $this->tokenizeString($value); | $tokens = $this->tokenizeString($value); | ||||
$ngrams = array(); | $ngrams = array(); | ||||
foreach ($tokens as $token) { | foreach ($tokens as $token) { | ||||
$token = phutil_utf8_strtolower($token); | $token = phutil_utf8_strtolower($token); | ||||
switch ($mode) { | if ($as_term) { | ||||
case 'query': | |||||
break; | |||||
case 'index': | |||||
$token = ' '.$token.' '; | $token = ' '.$token.' '; | ||||
break; | |||||
case 'prefix': | |||||
$token = ' '.$token; | |||||
break; | |||||
} | } | ||||
$token_v = phutil_utf8v($token); | $token_v = phutil_utf8v($token); | ||||
$len = (count($token_v) - 2); | $len = (count($token_v) - 2); | ||||
for ($ii = 0; $ii < $len; $ii++) { | for ($ii = 0; $ii < $len; $ii++) { | ||||
$ngram = array_slice($token_v, $ii, 3); | $ngram = array_slice($token_v, $ii, 3); | ||||
$ngram = implode('', $ngram); | $ngram = implode('', $ngram); | ||||
$ngrams[$ngram] = $ngram; | $ngrams[$ngram] = $ngram; | ||||
▲ Show 20 Lines • Show All 48 Lines • ▼ Show 20 Lines | public function newTermsCorpus($raw_corpus) { | ||||
// NOTE: Single quotes divide terms only if they're at a word boundary. | // NOTE: Single quotes divide terms only if they're at a word boundary. | ||||
// In contractions, like "whom'st've", the entire word is a single term. | // In contractions, like "whom'st've", the entire word is a single term. | ||||
$term_corpus = preg_replace('/(^| )[\']+/', ' ', $term_corpus); | $term_corpus = preg_replace('/(^| )[\']+/', ' ', $term_corpus); | ||||
$term_corpus = preg_replace('/[\']+( |$)/', ' ', $term_corpus); | $term_corpus = preg_replace('/[\']+( |$)/', ' ', $term_corpus); | ||||
$term_corpus = preg_replace('/\s+/u', ' ', $term_corpus); | $term_corpus = preg_replace('/\s+/u', ' ', $term_corpus); | ||||
$term_corpus = trim($term_corpus, ' '); | $term_corpus = trim($term_corpus, ' '); | ||||
if (strlen($term_corpus)) { | |||||
$term_corpus = ' '.$term_corpus.' '; | |||||
} | |||||
return $term_corpus; | return $term_corpus; | ||||
} | } | ||||
} | } |