diff --git a/src/search/PhutilSearchStemmer.php b/src/search/PhutilSearchStemmer.php index c5998ca..c4a8604 100644 --- a/src/search/PhutilSearchStemmer.php +++ b/src/search/PhutilSearchStemmer.php @@ -1,70 +1,74 @@ normalizeToken($token); return $this->applyStemmer($token); } public function stemCorpus($corpus) { + $corpus = $this->normalizeCorpus($corpus); $tokens = preg_split('/[^a-zA-Z0-9\x7F-\xFF._]+/', $corpus); $words = array(); foreach ($tokens as $key => $token) { $token = trim($token, '._'); if (strlen($token) < 3) { continue; } - $normal_word = $this->normalizeToken($token); - $words[$normal_word] = $normal_word; + $words[$token] = $token; } $stems = array(); - foreach ($words as $normal_word) { - $stems[] = $this->applyStemmer($normal_word); + foreach ($words as $word) { + $stems[] = $this->applyStemmer($word); } return implode(' ', $stems); } private function normalizeToken($token) { return phutil_utf8_strtolower($token); } + private function normalizeCorpus($corpus) { + return phutil_utf8_strtolower($corpus); + } + /** * @phutil-external-symbol class Porter */ private function applyStemmer($normalized_token) { // If the token has internal punctuation, handle it literally. This // deals with things like domain names, Conduit API methods, and other // sorts of informal tokens. if (preg_match('/[._]/', $normalized_token)) { return $normalized_token; } static $loaded; if ($loaded === null) { $root = dirname(phutil_get_library_root('phutil')); require_once $root.'/externals/porter-stemmer/src/Porter.php'; $loaded = true; } $stem = Porter::stem($normalized_token); // If the stem is too short, it won't be a candidate for indexing. These // tokens are also likely to be acronyms (like "DNS") rather than real // English words. if (strlen($stem) < 3) { return $normalized_token; } return $stem; } }