Page MenuHomePhabricator

D16943.id40777.diff
No OneTemporary

D16943.id40777.diff

diff --git a/externals/porter-stemmer/LICENSE b/externals/porter-stemmer/LICENSE
new file mode 100644
--- /dev/null
+++ b/externals/porter-stemmer/LICENSE
@@ -0,0 +1,20 @@
+The MIT License (MIT)
+
+Copyright (c) 2005-2016 Richard Heyes (http://www.phpguru.org/)
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+the Software, and to permit persons to whom the Software is furnished to do so,
+subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/externals/porter-stemmer/README.md b/externals/porter-stemmer/README.md
new file mode 100644
--- /dev/null
+++ b/externals/porter-stemmer/README.md
@@ -0,0 +1,42 @@
+# Porter Stemmer by Richard Heyes
+
+# Installation (with composer)
+
+```json
+{
+ "require": {
+ "camspiers/porter-stemmer": "1.0.0"
+ }
+}
+```
+
+ $ composer install
+
+# Usage
+
+```php
+$stem = Porter::Stem($word);
+```
+
+# License
+
+The MIT License (MIT)
+
+Copyright (c) 2005-2016 Richard Heyes (http://www.phpguru.org/)
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+the Software, and to permit persons to whom the Software is furnished to do so,
+subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/externals/porter-stemmer/src/Porter.php b/externals/porter-stemmer/src/Porter.php
new file mode 100644
--- /dev/null
+++ b/externals/porter-stemmer/src/Porter.php
@@ -0,0 +1,426 @@
+<?php
+
+# vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4:
+
+/**
+ * Copyright (c) 2005-2016 Richard Heyes (http://www.phpguru.org/)
+ *
+ * Portions Copyright 2003-2007 Jon Abernathy <jon@chuggnutt.com>
+ *
+ * Originally available under the GPL 2 or greater. Relicensed with permission
+ * of original authors under the MIT License in 2016.
+ *
+ * All rights reserved.
+ *
+ * @package PorterStemmer
+ * @author Richard Heyes
+ * @author Jon Abernathy <jon@chuggnutt.com>
+ * @copyright 2005-2016 Richard Heyes (http://www.phpguru.org/)
+ * @license http://www.opensource.org/licenses/mit-license.html MIT License
+ */
+
+/**
+ * PHP 5 Implementation of the Porter Stemmer algorithm. Certain elements
+ * were borrowed from the (broken) implementation by Jon Abernathy.
+ *
+ * See http://tartarus.org/~martin/PorterStemmer/ for a description of the
+ * algorithm.
+ *
+ * Usage:
+ *
+ * $stem = PorterStemmer::Stem($word);
+ *
+ * How easy is that?
+ *
+ * @package PorterStemmer
+ * @author Richard Heyes
+ * @author Jon Abernathy <jon@chuggnutt.com>
+ * @copyright 2005-2016 Richard Heyes (http://www.phpguru.org/)
+ * @license http://www.opensource.org/licenses/mit-license.html MIT License
+ */
+class Porter
+{
+ /**
+ * Regex for matching a consonant
+ *
+ * @var string
+ */
+ private static $regex_consonant = '(?:[bcdfghjklmnpqrstvwxz]|(?<=[aeiou])y|^y)';
+
+ /**
+ * Regex for matching a vowel
+ *
+ * @var string
+ */
+ private static $regex_vowel = '(?:[aeiou]|(?<![aeiou])y)';
+
+ /**
+ * Stems a word. Simple huh?
+ *
+ * @param string $word Word to stem
+ *
+ * @return string Stemmed word
+ */
+ public static function Stem($word)
+ {
+ if (strlen($word) <= 2) {
+ return $word;
+ }
+
+ $word = self::step1ab($word);
+ $word = self::step1c($word);
+ $word = self::step2($word);
+ $word = self::step3($word);
+ $word = self::step4($word);
+ $word = self::step5($word);
+
+ return $word;
+ }
+
+ /**
+ * Step 1
+ */
+ private static function step1ab($word)
+ {
+ // Part a
+ if (substr($word, -1) == 's') {
+
+ self::replace($word, 'sses', 'ss')
+ OR self::replace($word, 'ies', 'i')
+ OR self::replace($word, 'ss', 'ss')
+ OR self::replace($word, 's', '');
+ }
+
+ // Part b
+ if (substr($word, -2, 1) != 'e' OR !self::replace($word, 'eed', 'ee', 0)) { // First rule
+ $v = self::$regex_vowel;
+
+ // ing and ed
+ if ( preg_match("#$v+#", substr($word, 0, -3)) && self::replace($word, 'ing', '')
+ OR preg_match("#$v+#", substr($word, 0, -2)) && self::replace($word, 'ed', '')) { // Note use of && and OR, for precedence reasons
+
+ // If one of above two test successful
+ if ( !self::replace($word, 'at', 'ate')
+ AND !self::replace($word, 'bl', 'ble')
+ AND !self::replace($word, 'iz', 'ize')) {
+
+ // Double consonant ending
+ if ( self::doubleConsonant($word)
+ AND substr($word, -2) != 'll'
+ AND substr($word, -2) != 'ss'
+ AND substr($word, -2) != 'zz') {
+
+ $word = substr($word, 0, -1);
+
+ } elseif (self::m($word) == 1 AND self::cvc($word)) {
+ $word .= 'e';
+ }
+ }
+ }
+ }
+
+ return $word;
+ }
+
+ /**
+ * Step 1c
+ *
+ * @param string $word Word to stem
+ */
+ private static function step1c($word)
+ {
+ $v = self::$regex_vowel;
+
+ if (substr($word, -1) == 'y' && preg_match("#$v+#", substr($word, 0, -1))) {
+ self::replace($word, 'y', 'i');
+ }
+
+ return $word;
+ }
+
+ /**
+ * Step 2
+ *
+ * @param string $word Word to stem
+ */
+ private static function step2($word)
+ {
+ switch (substr($word, -2, 1)) {
+ case 'a':
+ self::replace($word, 'ational', 'ate', 0)
+ OR self::replace($word, 'tional', 'tion', 0);
+ break;
+
+ case 'c':
+ self::replace($word, 'enci', 'ence', 0)
+ OR self::replace($word, 'anci', 'ance', 0);
+ break;
+
+ case 'e':
+ self::replace($word, 'izer', 'ize', 0);
+ break;
+
+ case 'g':
+ self::replace($word, 'logi', 'log', 0);
+ break;
+
+ case 'l':
+ self::replace($word, 'entli', 'ent', 0)
+ OR self::replace($word, 'ousli', 'ous', 0)
+ OR self::replace($word, 'alli', 'al', 0)
+ OR self::replace($word, 'bli', 'ble', 0)
+ OR self::replace($word, 'eli', 'e', 0);
+ break;
+
+ case 'o':
+ self::replace($word, 'ization', 'ize', 0)
+ OR self::replace($word, 'ation', 'ate', 0)
+ OR self::replace($word, 'ator', 'ate', 0);
+ break;
+
+ case 's':
+ self::replace($word, 'iveness', 'ive', 0)
+ OR self::replace($word, 'fulness', 'ful', 0)
+ OR self::replace($word, 'ousness', 'ous', 0)
+ OR self::replace($word, 'alism', 'al', 0);
+ break;
+
+ case 't':
+ self::replace($word, 'biliti', 'ble', 0)
+ OR self::replace($word, 'aliti', 'al', 0)
+ OR self::replace($word, 'iviti', 'ive', 0);
+ break;
+ }
+
+ return $word;
+ }
+
+ /**
+ * Step 3
+ *
+ * @param string $word String to stem
+ */
+ private static function step3($word)
+ {
+ switch (substr($word, -2, 1)) {
+ case 'a':
+ self::replace($word, 'ical', 'ic', 0);
+ break;
+
+ case 's':
+ self::replace($word, 'ness', '', 0);
+ break;
+
+ case 't':
+ self::replace($word, 'icate', 'ic', 0)
+ OR self::replace($word, 'iciti', 'ic', 0);
+ break;
+
+ case 'u':
+ self::replace($word, 'ful', '', 0);
+ break;
+
+ case 'v':
+ self::replace($word, 'ative', '', 0);
+ break;
+
+ case 'z':
+ self::replace($word, 'alize', 'al', 0);
+ break;
+ }
+
+ return $word;
+ }
+
+ /**
+ * Step 4
+ *
+ * @param string $word Word to stem
+ */
+ private static function step4($word)
+ {
+ switch (substr($word, -2, 1)) {
+ case 'a':
+ self::replace($word, 'al', '', 1);
+ break;
+
+ case 'c':
+ self::replace($word, 'ance', '', 1)
+ OR self::replace($word, 'ence', '', 1);
+ break;
+
+ case 'e':
+ self::replace($word, 'er', '', 1);
+ break;
+
+ case 'i':
+ self::replace($word, 'ic', '', 1);
+ break;
+
+ case 'l':
+ self::replace($word, 'able', '', 1)
+ OR self::replace($word, 'ible', '', 1);
+ break;
+
+ case 'n':
+ self::replace($word, 'ant', '', 1)
+ OR self::replace($word, 'ement', '', 1)
+ OR self::replace($word, 'ment', '', 1)
+ OR self::replace($word, 'ent', '', 1);
+ break;
+
+ case 'o':
+ if (substr($word, -4) == 'tion' OR substr($word, -4) == 'sion') {
+ self::replace($word, 'ion', '', 1);
+ } else {
+ self::replace($word, 'ou', '', 1);
+ }
+ break;
+
+ case 's':
+ self::replace($word, 'ism', '', 1);
+ break;
+
+ case 't':
+ self::replace($word, 'ate', '', 1)
+ OR self::replace($word, 'iti', '', 1);
+ break;
+
+ case 'u':
+ self::replace($word, 'ous', '', 1);
+ break;
+
+ case 'v':
+ self::replace($word, 'ive', '', 1);
+ break;
+
+ case 'z':
+ self::replace($word, 'ize', '', 1);
+ break;
+ }
+
+ return $word;
+ }
+
+ /**
+ * Step 5
+ *
+ * @param string $word Word to stem
+ */
+ private static function step5($word)
+ {
+ // Part a
+ if (substr($word, -1) == 'e') {
+ if (self::m(substr($word, 0, -1)) > 1) {
+ self::replace($word, 'e', '');
+
+ } elseif (self::m(substr($word, 0, -1)) == 1) {
+
+ if (!self::cvc(substr($word, 0, -1))) {
+ self::replace($word, 'e', '');
+ }
+ }
+ }
+
+ // Part b
+ if (self::m($word) > 1 AND self::doubleConsonant($word) AND substr($word, -1) == 'l') {
+ $word = substr($word, 0, -1);
+ }
+
+ return $word;
+ }
+
+ /**
+ * Replaces the first string with the second, at the end of the string
+ *
+ * If third arg is given, then the preceding string must match that m
+ * count at least.
+ *
+ * @param string $str String to check
+ * @param string $check Ending to check for
+ * @param string $repl Replacement string
+ * @param int $m Optional minimum number of m() to meet
+ *
+ * @return bool Whether the $check string was at the end of the $str
+ * string. True does not necessarily mean that it was
+ * replaced.
+ */
+ private static function replace(&$str, $check, $repl, $m = null)
+ {
+ $len = 0 - strlen($check);
+
+ if (substr($str, $len) == $check) {
+ $substr = substr($str, 0, $len);
+ if (is_null($m) OR self::m($substr) > $m) {
+ $str = $substr . $repl;
+ }
+
+ return true;
+ }
+
+ return false;
+ }
+
+ /**
+ * What, you mean it's not obvious from the name?
+ *
+ * m() measures the number of consonant sequences in $str. if c is
+ * a consonant sequence and v a vowel sequence, and <..> indicates arbitrary
+ * presence,
+ *
+ * <c><v> gives 0
+ * <c>vc<v> gives 1
+ * <c>vcvc<v> gives 2
+ * <c>vcvcvc<v> gives 3
+ *
+ * @param string $str The string to return the m count for
+ *
+ * @return int The m count
+ */
+ private static function m($str)
+ {
+ $c = self::$regex_consonant;
+ $v = self::$regex_vowel;
+
+ $str = preg_replace("#^$c+#", '', $str);
+ $str = preg_replace("#$v+$#", '', $str);
+
+ preg_match_all("#($v+$c+)#", $str, $matches);
+
+ return count($matches[1]);
+ }
+
+ /**
+ * Returns true/false as to whether the given string contains two
+ * of the same consonant next to each other at the end of the string.
+ *
+ * @param string $str String to check
+ *
+ * @return bool Result
+ */
+ private static function doubleConsonant($str)
+ {
+ $c = self::$regex_consonant;
+
+ return preg_match("#$c{2}$#", $str, $matches) AND $matches[0]{0} == $matches[0]{1};
+ }
+
+ /**
+ * Checks for ending CVC sequence where second C is not W, X or Y
+ *
+ * @param string $str String to check
+ *
+ * @return bool Result
+ */
+ private static function cvc($str)
+ {
+ $c = self::$regex_consonant;
+ $v = self::$regex_vowel;
+
+ return preg_match("#($c$v$c)$#", $str, $matches)
+ AND strlen($matches[1]) == 3
+ AND $matches[1]{2} != 'w'
+ AND $matches[1]{2} != 'x'
+ AND $matches[1]{2} != 'y';
+ }
+}
diff --git a/src/__phutil_library_map__.php b/src/__phutil_library_map__.php
--- a/src/__phutil_library_map__.php
+++ b/src/__phutil_library_map__.php
@@ -383,6 +383,8 @@
'PhutilSearchQueryCompiler' => 'search/PhutilSearchQueryCompiler.php',
'PhutilSearchQueryCompilerSyntaxException' => 'search/PhutilSearchQueryCompilerSyntaxException.php',
'PhutilSearchQueryCompilerTestCase' => 'search/__tests__/PhutilSearchQueryCompilerTestCase.php',
+ 'PhutilSearchStemmer' => 'search/PhutilSearchStemmer.php',
+ 'PhutilSearchStemmerTestCase' => 'search/__tests__/PhutilSearchStemmerTestCase.php',
'PhutilServiceProfiler' => 'serviceprofiler/PhutilServiceProfiler.php',
'PhutilShellLexer' => 'lexer/PhutilShellLexer.php',
'PhutilShellLexerTestCase' => 'lexer/__tests__/PhutilShellLexerTestCase.php',
@@ -987,6 +989,8 @@
'PhutilSearchQueryCompiler' => 'Phobject',
'PhutilSearchQueryCompilerSyntaxException' => 'Exception',
'PhutilSearchQueryCompilerTestCase' => 'PhutilTestCase',
+ 'PhutilSearchStemmer' => 'Phobject',
+ 'PhutilSearchStemmerTestCase' => 'PhutilTestCase',
'PhutilServiceProfiler' => 'Phobject',
'PhutilShellLexer' => 'PhutilLexer',
'PhutilShellLexerTestCase' => 'PhutilTestCase',
diff --git a/src/search/PhutilSearchQueryCompiler.php b/src/search/PhutilSearchQueryCompiler.php
--- a/src/search/PhutilSearchQueryCompiler.php
+++ b/src/search/PhutilSearchQueryCompiler.php
@@ -5,6 +5,7 @@
private $operators = '+ -><()~*:""&|';
private $query;
+ private $stemmer;
const OPERATOR_NOT = 'not';
const OPERATOR_AND = 'and';
@@ -27,6 +28,15 @@
return $this->query;
}
+ public function setStemmer(PhutilSearchStemmer $stemmer) {
+ $this->stemmer = $stemmer;
+ return $this;
+ }
+
+ public function getStemmer() {
+ return $this->stemmer;
+ }
+
public function compileQuery() {
$query = $this->getQuery();
$tokens = $this->tokenizeQuery($query);
@@ -36,8 +46,46 @@
$result[] = $this->renderToken($token);
}
- $result = array_unique($result);
- return implode(' ', $result);
+ return $this->compileRenderedTokens($result);
+ }
+
+ public function compileLiteralQuery() {
+ $query = $this->getQuery();
+ $tokens = $this->tokenizeQuery($query);
+
+ $result = array();
+ foreach ($tokens as $token) {
+ if (!$token['quoted']) {
+ continue;
+ }
+ $result[] = $this->renderToken($token);
+ }
+
+ return $this->compileRenderedTokens($result);
+ }
+
+ public function compileStemmedQuery() {
+ $query = $this->getQuery();
+ $tokens = $this->tokenizeQuery($query);
+
+ $result = array();
+ foreach ($tokens as $token) {
+ if ($token['quoted']) {
+ continue;
+ }
+ $result[] = $this->renderToken($token, $this->getStemmer());
+ }
+
+ return $this->compileRenderedTokens($result);
+ }
+
+ private function compileRenderedTokens(array $list) {
+ if (!$list) {
+ return null;
+ }
+
+ $list = array_unique($list);
+ return implode(' ', $list);
}
private function tokenizeQuery($query) {
@@ -184,8 +232,16 @@
return $results;
}
- private function renderToken(array $token) {
- $value = $this->quoteToken($token['value']);
+ private function renderToken(
+ array $token,
+ PhutilSearchStemmer $stemmer = null) {
+ $value = $token['value'];
+
+ if ($stemmer) {
+ $value = $stemmer->stemToken($value);
+ }
+
+ $value = $this->quoteToken($value);
$operator = $token['operator'];
$prefix = $this->getOperatorPrefix($operator);
diff --git a/src/search/PhutilSearchStemmer.php b/src/search/PhutilSearchStemmer.php
new file mode 100644
--- /dev/null
+++ b/src/search/PhutilSearchStemmer.php
@@ -0,0 +1,51 @@
+<?php
+
+final class PhutilSearchStemmer
+ extends Phobject {
+
+ public function stemToken($token) {
+ $token = $this->normalizeToken($token);
+ return $this->applyStemmer($token);
+ }
+
+ public function stemCorpus($corpus) {
+ $tokens = preg_split('/[^a-zA-Z0-9\x7F-\xFF]+/', $corpus);
+
+ $words = array();
+ foreach ($tokens as $key => $token) {
+ if (strlen($token) < 3) {
+ continue;
+ }
+
+ $normal_word = $this->normalizeToken($token);
+ $words[$normal_word] = $normal_word;
+ }
+
+ $stems = array();
+ foreach ($words as $normal_word) {
+ $stems[] = $this->applyStemmer($normal_word);
+ }
+
+ return implode(' ', $stems);
+ }
+
+ private function normalizeToken($token) {
+ return phutil_utf8_strtolower($token);
+ }
+
+ /**
+ * @phutil-external-symbol class Porter
+ */
+ private function applyStemmer($normalized_token) {
+ static $loaded;
+
+ if ($loaded === null) {
+ $root = dirname(phutil_get_library_root('phutil'));
+ require_once $root.'/externals/porter-stemmer/src/Porter.php';
+ $loaded = true;
+ }
+
+ return Porter::stem($normalized_token);
+ }
+
+}
diff --git a/src/search/__tests__/PhutilSearchQueryCompilerTestCase.php b/src/search/__tests__/PhutilSearchQueryCompilerTestCase.php
--- a/src/search/__tests__/PhutilSearchQueryCompilerTestCase.php
+++ b/src/search/__tests__/PhutilSearchQueryCompilerTestCase.php
@@ -3,10 +3,9 @@
final class PhutilSearchQueryCompilerTestCase
extends PhutilTestCase {
-
public function testCompileQueries() {
$tests = array(
- '' => '',
+ '' => null,
'cat dog' => '+"cat" +"dog"',
'cat -dog' => '+"cat" -"dog"',
'cat-dog' => '+"cat-dog"',
@@ -60,10 +59,45 @@
}
- private function assertCompileQueries(array $tests, $operators = null) {
+ public function testCompileQueriesWithStemming() {
+ $stemming_tests = array(
+ 'cat dog' => array(
+ null,
+ '+"cat" +"dog"',
+ ),
+ 'cats dogs' => array(
+ null,
+ '+"cat" +"dog"',
+ ),
+ 'cats "dogs"' => array(
+ '+"dogs"',
+ '+"cat"',
+ ),
+ '"blessed blade" of the windseeker' => array(
+ '+"blessed blade"',
+ '+"of" +"the" +"windseek"',
+ ),
+ 'mailing users for mentions on tasks' => array(
+ null,
+ '+"mail" +"user" +"for" +"mention" +"on" +"task"',
+ ),
+ );
+
+ $stemmer = new PhutilSearchStemmer();
+ $this->assertCompileQueries($stemming_tests, null, $stemmer);
+ }
+
+ private function assertCompileQueries(
+ array $tests,
+ $operators = null,
+ PhutilSearchStemmer $stemmer = null) {
foreach ($tests as $input => $expect) {
$caught = null;
+ $query = null;
+ $literal_query = null;
+ $stemmed_query = null;
+
try {
$compiler = id(new PhutilSearchQueryCompiler())
->setQuery($input);
@@ -72,19 +106,39 @@
$compiler->setOperators($operators);
}
- $query = $compiler->compileQuery();
+ if ($stemmer !== null) {
+ $compiler->setStemmer($stemmer);
+ }
+
+ if ($stemmer) {
+ $literal_query = $compiler->compileLiteralQuery();
+ $stemmed_query = $compiler->compileStemmedQuery();
+ } else {
+ $query = $compiler->compileQuery();
+ }
} catch (PhutilSearchQueryCompilerSyntaxException $ex) {
$caught = $ex;
}
if ($caught !== null) {
$query = false;
+ $literal_query = false;
+ $stemmed_query = false;
}
- $this->assertEqual(
- $expect,
- $query,
- pht('Compilation of query: %s', $input));
+ if (!$stemmer) {
+ $this->assertEqual(
+ $expect,
+ $query,
+ pht('Compilation of query: %s', $input));
+ } else {
+ $this->assertEqual(
+ $expect,
+ ($literal_query === false)
+ ? false
+ : array($literal_query, $stemmed_query),
+ pht('Stemmed compilation of query: %s', $input));
+ }
}
}
diff --git a/src/search/__tests__/PhutilSearchStemmerTestCase.php b/src/search/__tests__/PhutilSearchStemmerTestCase.php
new file mode 100644
--- /dev/null
+++ b/src/search/__tests__/PhutilSearchStemmerTestCase.php
@@ -0,0 +1,68 @@
+<?php
+
+final class PhutilSearchStemmerTestCase
+ extends PhutilTestCase {
+
+ public function testStemTokens() {
+ $tests = array(
+ // Various real-world cases collected from users before we implemented
+ // stemming.
+ 'tokens' => 'token',
+ 'panels' => 'panel',
+
+ 'renames' => 'renam',
+ 'rename' => 'renam',
+
+ 'components' => 'compon',
+ 'component' => 'compon',
+
+ 'implementation' => 'implement',
+ 'implements' => 'implement',
+ 'implementing' => 'implement',
+ 'implementer' => 'implement',
+
+ 'deleting' => 'delet',
+ 'deletion' => 'delet',
+ 'delete' => 'delet',
+
+ 'erratically' => 'errat',
+ 'erratic' => 'errat',
+
+ // Stems should be normalized.
+ 'DOG' => 'dog',
+ );
+
+ $stemmer = new PhutilSearchStemmer();
+ foreach ($tests as $input => $expect) {
+ $stem = $stemmer->stemToken($input);
+ $this->assertEqual(
+ $expect,
+ $stem,
+ pht('Token stem of "%s".', $input));
+ }
+ }
+
+ public function testStemDocuments() {
+ $tests = array(
+ 'The wild boar meandered erratically.' =>
+ 'the wild boar meander errat',
+ 'Fool me onc, shame on you. Fool me twice, shame on me.' =>
+ 'fool onc shame you twice',
+ 'Fireball is a seventh-level spell which deals 2d16 points of damage '.
+ 'in a 1-meter radius around a target.' =>
+ 'firebal seventh level spell which deal 2d16 point damag meter '.
+ 'radiu around target',
+ );
+
+ $stemmer = new PhutilSearchStemmer();
+ foreach ($tests as $input => $expect) {
+ $stem = $stemmer->stemCorpus($input);
+ $this->assertEqual(
+ $expect,
+ $stem,
+ pht('Corpus stem of: %s', $input));
+ }
+ }
+
+
+}

File Metadata

Mime Type
text/plain
Expires
Tue, Feb 11, 6:52 PM (8 h, 35 m)
Storage Engine
blob
Storage Format
Encrypted (AES-256-CBC)
Storage Handle
7121709
Default Alt Text
D16943.id40777.diff (24 KB)

Event Timeline