diff --git a/src/search/PhutilSearchStemmer.php b/src/search/PhutilSearchStemmer.php index 84ba634..c5998ca 100644 --- a/src/search/PhutilSearchStemmer.php +++ b/src/search/PhutilSearchStemmer.php @@ -1,60 +1,70 @@ normalizeToken($token); return $this->applyStemmer($token); } public function stemCorpus($corpus) { - $tokens = preg_split('/[^a-zA-Z0-9\x7F-\xFF]+/', $corpus); + $tokens = preg_split('/[^a-zA-Z0-9\x7F-\xFF._]+/', $corpus); $words = array(); foreach ($tokens as $key => $token) { + $token = trim($token, '._'); + if (strlen($token) < 3) { continue; } $normal_word = $this->normalizeToken($token); $words[$normal_word] = $normal_word; } $stems = array(); foreach ($words as $normal_word) { $stems[] = $this->applyStemmer($normal_word); } return implode(' ', $stems); } private function normalizeToken($token) { return phutil_utf8_strtolower($token); } /** * @phutil-external-symbol class Porter */ private function applyStemmer($normalized_token) { + // If the token has internal punctuation, handle it literally. This + // deals with things like domain names, Conduit API methods, and other + // sorts of informal tokens. + if (preg_match('/[._]/', $normalized_token)) { + return $normalized_token; + } + static $loaded; if ($loaded === null) { $root = dirname(phutil_get_library_root('phutil')); require_once $root.'/externals/porter-stemmer/src/Porter.php'; $loaded = true; } + $stem = Porter::stem($normalized_token); // If the stem is too short, it won't be a candidate for indexing. These // tokens are also likely to be acronyms (like "DNS") rather than real // English words. if (strlen($stem) < 3) { return $normalized_token; } return $stem; } } diff --git a/src/search/__tests__/PhutilSearchStemmerTestCase.php b/src/search/__tests__/PhutilSearchStemmerTestCase.php index e6b9692..34154d9 100644 --- a/src/search/__tests__/PhutilSearchStemmerTestCase.php +++ b/src/search/__tests__/PhutilSearchStemmerTestCase.php @@ -1,73 +1,85 @@ 'token', 'panels' => 'panel', 'renames' => 'renam', 'rename' => 'renam', 'components' => 'compon', 'component' => 'compon', 'implementation' => 'implement', 'implements' => 'implement', 'implementing' => 'implement', 'implementer' => 'implement', 'deleting' => 'delet', 'deletion' => 'delet', 'delete' => 'delet', 'erratically' => 'errat', 'erratic' => 'errat', // Stems should be normalized. 'DOG' => 'dog', // If stemming would bring a token under 3 characters, it should not // be stemmed. 'dns' => 'dns', 'nis' => 'nis', + + // Complex tokens with internal punctuation should be left untouched; + // these are usually things like domain names, API calls, informal tags, + // etc. + 'apples' => 'appl', + 'bananas' => 'banana', + 'apples_bananas' => 'apples_bananas', + 'apples_bananas.apples_bananas' => 'apples_bananas.apples_bananas', ); $stemmer = new PhutilSearchStemmer(); foreach ($tests as $input => $expect) { $stem = $stemmer->stemToken($input); $this->assertEqual( $expect, $stem, pht('Token stem of "%s".', $input)); } } public function testStemDocuments() { $tests = array( 'The wild boar meandered erratically.' => 'the wild boar meander errat', 'Fool me onc, shame on you. Fool me twice, shame on me.' => 'fool onc shame you twice', 'Fireball is a seventh-level spell which deals 2d16 points of damage '. 'in a 1-meter radius around a target.' => 'firebal seventh level spell which deal 2d16 point damag meter '. 'radiu around target', + 'apples-bananas' => 'appl banana', + 'apples_bananas' => 'apples_bananas', + 'apples.bananas' => 'apples.bananas', + 'oddly-proportioned' => 'oddli proport', ); $stemmer = new PhutilSearchStemmer(); foreach ($tests as $input => $expect) { $stem = $stemmer->stemCorpus($input); $this->assertEqual( $expect, $stem, pht('Corpus stem of: %s', $input)); } } }