diff --git a/src/search/PhutilSearchStemmer.php b/src/search/PhutilSearchStemmer.php --- a/src/search/PhutilSearchStemmer.php +++ b/src/search/PhutilSearchStemmer.php @@ -9,10 +9,12 @@ } public function stemCorpus($corpus) { - $tokens = preg_split('/[^a-zA-Z0-9\x7F-\xFF]+/', $corpus); + $tokens = preg_split('/[^a-zA-Z0-9\x7F-\xFF._]+/', $corpus); $words = array(); foreach ($tokens as $key => $token) { + $token = trim($token, '._'); + if (strlen($token) < 3) { continue; } @@ -37,6 +39,13 @@ * @phutil-external-symbol class Porter */ private function applyStemmer($normalized_token) { + // If the token has internal punctuation, handle it literally. This + // deals with things like domain names, Conduit API methods, and other + // sorts of informal tokens. + if (preg_match('/[._]/', $normalized_token)) { + return $normalized_token; + } + static $loaded; if ($loaded === null) { @@ -45,6 +54,7 @@ $loaded = true; } + $stem = Porter::stem($normalized_token); // If the stem is too short, it won't be a candidate for indexing. These diff --git a/src/search/__tests__/PhutilSearchStemmerTestCase.php b/src/search/__tests__/PhutilSearchStemmerTestCase.php --- a/src/search/__tests__/PhutilSearchStemmerTestCase.php +++ b/src/search/__tests__/PhutilSearchStemmerTestCase.php @@ -35,6 +35,14 @@ // be stemmed. 'dns' => 'dns', 'nis' => 'nis', + + // Complex tokens with internal punctuation should be left untouched; + // these are usually things like domain names, API calls, informal tags, + // etc. + 'apples' => 'appl', + 'bananas' => 'banana', + 'apples_bananas' => 'apples_bananas', + 'apples_bananas.apples_bananas' => 'apples_bananas.apples_bananas', ); $stemmer = new PhutilSearchStemmer(); @@ -57,6 +65,10 @@ 'in a 1-meter radius around a target.' => 'firebal seventh level spell which deal 2d16 point damag meter '. 'radiu around target', + 'apples-bananas' => 'appl banana', + 'apples_bananas' => 'apples_bananas', + 'apples.bananas' => 'apples.bananas', + 'oddly-proportioned' => 'oddli proport', ); $stemmer = new PhutilSearchStemmer();