diff --git a/externals/porter-stemmer/LICENSE b/externals/porter-stemmer/LICENSE new file mode 100644 --- /dev/null +++ b/externals/porter-stemmer/LICENSE @@ -0,0 +1,20 @@ +The MIT License (MIT) + +Copyright (c) 2005-2016 Richard Heyes (http://www.phpguru.org/) + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software is furnished to do so, +subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/externals/porter-stemmer/README.md b/externals/porter-stemmer/README.md new file mode 100644 --- /dev/null +++ b/externals/porter-stemmer/README.md @@ -0,0 +1,42 @@ +# Porter Stemmer by Richard Heyes + +# Installation (with composer) + +```json +{ + "require": { + "camspiers/porter-stemmer": "1.0.0" + } +} +``` + + $ composer install + +# Usage + +```php +$stem = Porter::Stem($word); +``` + +# License + +The MIT License (MIT) + +Copyright (c) 2005-2016 Richard Heyes (http://www.phpguru.org/) + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software is furnished to do so, +subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/externals/porter-stemmer/src/Porter.php b/externals/porter-stemmer/src/Porter.php new file mode 100644 --- /dev/null +++ b/externals/porter-stemmer/src/Porter.php @@ -0,0 +1,426 @@ + + * + * Originally available under the GPL 2 or greater. Relicensed with permission + * of original authors under the MIT License in 2016. + * + * All rights reserved. + * + * @package PorterStemmer + * @author Richard Heyes + * @author Jon Abernathy + * @copyright 2005-2016 Richard Heyes (http://www.phpguru.org/) + * @license http://www.opensource.org/licenses/mit-license.html MIT License + */ + +/** + * PHP 5 Implementation of the Porter Stemmer algorithm. Certain elements + * were borrowed from the (broken) implementation by Jon Abernathy. + * + * See http://tartarus.org/~martin/PorterStemmer/ for a description of the + * algorithm. + * + * Usage: + * + * $stem = PorterStemmer::Stem($word); + * + * How easy is that? + * + * @package PorterStemmer + * @author Richard Heyes + * @author Jon Abernathy + * @copyright 2005-2016 Richard Heyes (http://www.phpguru.org/) + * @license http://www.opensource.org/licenses/mit-license.html MIT License + */ +class Porter +{ + /** + * Regex for matching a consonant + * + * @var string + */ + private static $regex_consonant = '(?:[bcdfghjklmnpqrstvwxz]|(?<=[aeiou])y|^y)'; + + /** + * Regex for matching a vowel + * + * @var string + */ + private static $regex_vowel = '(?:[aeiou]|(? 1) { + self::replace($word, 'e', ''); + + } elseif (self::m(substr($word, 0, -1)) == 1) { + + if (!self::cvc(substr($word, 0, -1))) { + self::replace($word, 'e', ''); + } + } + } + + // Part b + if (self::m($word) > 1 AND self::doubleConsonant($word) AND substr($word, -1) == 'l') { + $word = substr($word, 0, -1); + } + + return $word; + } + + /** + * Replaces the first string with the second, at the end of the string + * + * If third arg is given, then the preceding string must match that m + * count at least. + * + * @param string $str String to check + * @param string $check Ending to check for + * @param string $repl Replacement string + * @param int $m Optional minimum number of m() to meet + * + * @return bool Whether the $check string was at the end of the $str + * string. True does not necessarily mean that it was + * replaced. + */ + private static function replace(&$str, $check, $repl, $m = null) + { + $len = 0 - strlen($check); + + if (substr($str, $len) == $check) { + $substr = substr($str, 0, $len); + if (is_null($m) OR self::m($substr) > $m) { + $str = $substr . $repl; + } + + return true; + } + + return false; + } + + /** + * What, you mean it's not obvious from the name? + * + * m() measures the number of consonant sequences in $str. if c is + * a consonant sequence and v a vowel sequence, and <..> indicates arbitrary + * presence, + * + * gives 0 + * vc gives 1 + * vcvc gives 2 + * vcvcvc gives 3 + * + * @param string $str The string to return the m count for + * + * @return int The m count + */ + private static function m($str) + { + $c = self::$regex_consonant; + $v = self::$regex_vowel; + + $str = preg_replace("#^$c+#", '', $str); + $str = preg_replace("#$v+$#", '', $str); + + preg_match_all("#($v+$c+)#", $str, $matches); + + return count($matches[1]); + } + + /** + * Returns true/false as to whether the given string contains two + * of the same consonant next to each other at the end of the string. + * + * @param string $str String to check + * + * @return bool Result + */ + private static function doubleConsonant($str) + { + $c = self::$regex_consonant; + + return preg_match("#$c{2}$#", $str, $matches) AND $matches[0]{0} == $matches[0]{1}; + } + + /** + * Checks for ending CVC sequence where second C is not W, X or Y + * + * @param string $str String to check + * + * @return bool Result + */ + private static function cvc($str) + { + $c = self::$regex_consonant; + $v = self::$regex_vowel; + + return preg_match("#($c$v$c)$#", $str, $matches) + AND strlen($matches[1]) == 3 + AND $matches[1]{2} != 'w' + AND $matches[1]{2} != 'x' + AND $matches[1]{2} != 'y'; + } +} diff --git a/src/__phutil_library_map__.php b/src/__phutil_library_map__.php --- a/src/__phutil_library_map__.php +++ b/src/__phutil_library_map__.php @@ -383,6 +383,8 @@ 'PhutilSearchQueryCompiler' => 'search/PhutilSearchQueryCompiler.php', 'PhutilSearchQueryCompilerSyntaxException' => 'search/PhutilSearchQueryCompilerSyntaxException.php', 'PhutilSearchQueryCompilerTestCase' => 'search/__tests__/PhutilSearchQueryCompilerTestCase.php', + 'PhutilSearchStemmer' => 'search/PhutilSearchStemmer.php', + 'PhutilSearchStemmerTestCase' => 'search/__tests__/PhutilSearchStemmerTestCase.php', 'PhutilServiceProfiler' => 'serviceprofiler/PhutilServiceProfiler.php', 'PhutilShellLexer' => 'lexer/PhutilShellLexer.php', 'PhutilShellLexerTestCase' => 'lexer/__tests__/PhutilShellLexerTestCase.php', @@ -987,6 +989,8 @@ 'PhutilSearchQueryCompiler' => 'Phobject', 'PhutilSearchQueryCompilerSyntaxException' => 'Exception', 'PhutilSearchQueryCompilerTestCase' => 'PhutilTestCase', + 'PhutilSearchStemmer' => 'Phobject', + 'PhutilSearchStemmerTestCase' => 'PhutilTestCase', 'PhutilServiceProfiler' => 'Phobject', 'PhutilShellLexer' => 'PhutilLexer', 'PhutilShellLexerTestCase' => 'PhutilTestCase', diff --git a/src/search/PhutilSearchQueryCompiler.php b/src/search/PhutilSearchQueryCompiler.php --- a/src/search/PhutilSearchQueryCompiler.php +++ b/src/search/PhutilSearchQueryCompiler.php @@ -5,6 +5,7 @@ private $operators = '+ -><()~*:""&|'; private $query; + private $stemmer; const OPERATOR_NOT = 'not'; const OPERATOR_AND = 'and'; @@ -27,6 +28,15 @@ return $this->query; } + public function setStemmer(PhutilSearchStemmer $stemmer) { + $this->stemmer = $stemmer; + return $this; + } + + public function getStemmer() { + return $this->stemmer; + } + public function compileQuery() { $query = $this->getQuery(); $tokens = $this->tokenizeQuery($query); @@ -36,8 +46,46 @@ $result[] = $this->renderToken($token); } - $result = array_unique($result); - return implode(' ', $result); + return $this->compileRenderedTokens($result); + } + + public function compileLiteralQuery() { + $query = $this->getQuery(); + $tokens = $this->tokenizeQuery($query); + + $result = array(); + foreach ($tokens as $token) { + if (!$token['quoted']) { + continue; + } + $result[] = $this->renderToken($token); + } + + return $this->compileRenderedTokens($result); + } + + public function compileStemmedQuery() { + $query = $this->getQuery(); + $tokens = $this->tokenizeQuery($query); + + $result = array(); + foreach ($tokens as $token) { + if ($token['quoted']) { + continue; + } + $result[] = $this->renderToken($token, $this->getStemmer()); + } + + return $this->compileRenderedTokens($result); + } + + private function compileRenderedTokens(array $list) { + if (!$list) { + return null; + } + + $list = array_unique($list); + return implode(' ', $list); } private function tokenizeQuery($query) { @@ -184,8 +232,16 @@ return $results; } - private function renderToken(array $token) { - $value = $this->quoteToken($token['value']); + private function renderToken( + array $token, + PhutilSearchStemmer $stemmer = null) { + $value = $token['value']; + + if ($stemmer) { + $value = $stemmer->stemToken($value); + } + + $value = $this->quoteToken($value); $operator = $token['operator']; $prefix = $this->getOperatorPrefix($operator); diff --git a/src/search/PhutilSearchStemmer.php b/src/search/PhutilSearchStemmer.php new file mode 100644 --- /dev/null +++ b/src/search/PhutilSearchStemmer.php @@ -0,0 +1,51 @@ +normalizeToken($token); + return $this->applyStemmer($token); + } + + public function stemCorpus($corpus) { + $tokens = preg_split('/[^a-zA-Z0-9\x7F-\xFF]+/', $corpus); + + $words = array(); + foreach ($tokens as $key => $token) { + if (strlen($token) < 3) { + continue; + } + + $normal_word = $this->normalizeToken($token); + $words[$normal_word] = $normal_word; + } + + $stems = array(); + foreach ($words as $normal_word) { + $stems[] = $this->applyStemmer($normal_word); + } + + return implode(' ', $stems); + } + + private function normalizeToken($token) { + return phutil_utf8_strtolower($token); + } + + /** + * @phutil-external-symbol class Porter + */ + private function applyStemmer($normalized_token) { + static $loaded; + + if ($loaded === null) { + $root = dirname(phutil_get_library_root('phutil')); + require_once $root.'/externals/porter-stemmer/src/Porter.php'; + $loaded = true; + } + + return Porter::stem($normalized_token); + } + +} diff --git a/src/search/__tests__/PhutilSearchQueryCompilerTestCase.php b/src/search/__tests__/PhutilSearchQueryCompilerTestCase.php --- a/src/search/__tests__/PhutilSearchQueryCompilerTestCase.php +++ b/src/search/__tests__/PhutilSearchQueryCompilerTestCase.php @@ -3,10 +3,9 @@ final class PhutilSearchQueryCompilerTestCase extends PhutilTestCase { - public function testCompileQueries() { $tests = array( - '' => '', + '' => null, 'cat dog' => '+"cat" +"dog"', 'cat -dog' => '+"cat" -"dog"', 'cat-dog' => '+"cat-dog"', @@ -60,10 +59,45 @@ } - private function assertCompileQueries(array $tests, $operators = null) { + public function testCompileQueriesWithStemming() { + $stemming_tests = array( + 'cat dog' => array( + null, + '+"cat" +"dog"', + ), + 'cats dogs' => array( + null, + '+"cat" +"dog"', + ), + 'cats "dogs"' => array( + '+"dogs"', + '+"cat"', + ), + '"blessed blade" of the windseeker' => array( + '+"blessed blade"', + '+"of" +"the" +"windseek"', + ), + 'mailing users for mentions on tasks' => array( + null, + '+"mail" +"user" +"for" +"mention" +"on" +"task"', + ), + ); + + $stemmer = new PhutilSearchStemmer(); + $this->assertCompileQueries($stemming_tests, null, $stemmer); + } + + private function assertCompileQueries( + array $tests, + $operators = null, + PhutilSearchStemmer $stemmer = null) { foreach ($tests as $input => $expect) { $caught = null; + $query = null; + $literal_query = null; + $stemmed_query = null; + try { $compiler = id(new PhutilSearchQueryCompiler()) ->setQuery($input); @@ -72,19 +106,39 @@ $compiler->setOperators($operators); } - $query = $compiler->compileQuery(); + if ($stemmer !== null) { + $compiler->setStemmer($stemmer); + } + + if ($stemmer) { + $literal_query = $compiler->compileLiteralQuery(); + $stemmed_query = $compiler->compileStemmedQuery(); + } else { + $query = $compiler->compileQuery(); + } } catch (PhutilSearchQueryCompilerSyntaxException $ex) { $caught = $ex; } if ($caught !== null) { $query = false; + $literal_query = false; + $stemmed_query = false; } - $this->assertEqual( - $expect, - $query, - pht('Compilation of query: %s', $input)); + if (!$stemmer) { + $this->assertEqual( + $expect, + $query, + pht('Compilation of query: %s', $input)); + } else { + $this->assertEqual( + $expect, + ($literal_query === false) + ? false + : array($literal_query, $stemmed_query), + pht('Stemmed compilation of query: %s', $input)); + } } } diff --git a/src/search/__tests__/PhutilSearchStemmerTestCase.php b/src/search/__tests__/PhutilSearchStemmerTestCase.php new file mode 100644 --- /dev/null +++ b/src/search/__tests__/PhutilSearchStemmerTestCase.php @@ -0,0 +1,68 @@ + 'token', + 'panels' => 'panel', + + 'renames' => 'renam', + 'rename' => 'renam', + + 'components' => 'compon', + 'component' => 'compon', + + 'implementation' => 'implement', + 'implements' => 'implement', + 'implementing' => 'implement', + 'implementer' => 'implement', + + 'deleting' => 'delet', + 'deletion' => 'delet', + 'delete' => 'delet', + + 'erratically' => 'errat', + 'erratic' => 'errat', + + // Stems should be normalized. + 'DOG' => 'dog', + ); + + $stemmer = new PhutilSearchStemmer(); + foreach ($tests as $input => $expect) { + $stem = $stemmer->stemToken($input); + $this->assertEqual( + $expect, + $stem, + pht('Token stem of "%s".', $input)); + } + } + + public function testStemDocuments() { + $tests = array( + 'The wild boar meandered erratically.' => + 'the wild boar meander errat', + 'Fool me onc, shame on you. Fool me twice, shame on me.' => + 'fool onc shame you twice', + 'Fireball is a seventh-level spell which deals 2d16 points of damage '. + 'in a 1-meter radius around a target.' => + 'firebal seventh level spell which deal 2d16 point damag meter '. + 'radiu around target', + ); + + $stemmer = new PhutilSearchStemmer(); + foreach ($tests as $input => $expect) { + $stem = $stemmer->stemCorpus($input); + $this->assertEqual( + $expect, + $stem, + pht('Corpus stem of: %s', $input)); + } + } + + +}