Page MenuHomePhabricator

D20939.diff
No OneTemporary

D20939.diff

diff --git a/externals/porter-stemmer/LICENSE b/externals/porter-stemmer/LICENSE
new file mode 100644
--- /dev/null
+++ b/externals/porter-stemmer/LICENSE
@@ -0,0 +1,20 @@
+The MIT License (MIT)
+
+Copyright (c) 2005-2016 Richard Heyes (http://www.phpguru.org/)
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+the Software, and to permit persons to whom the Software is furnished to do so,
+subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/externals/porter-stemmer/README.md b/externals/porter-stemmer/README.md
new file mode 100644
--- /dev/null
+++ b/externals/porter-stemmer/README.md
@@ -0,0 +1,42 @@
+# Porter Stemmer by Richard Heyes
+
+# Installation (with composer)
+
+```json
+{
+ "require": {
+ "camspiers/porter-stemmer": "1.0.0"
+ }
+}
+```
+
+ $ composer install
+
+# Usage
+
+```php
+$stem = Porter::Stem($word);
+```
+
+# License
+
+The MIT License (MIT)
+
+Copyright (c) 2005-2016 Richard Heyes (http://www.phpguru.org/)
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+the Software, and to permit persons to whom the Software is furnished to do so,
+subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/externals/porter-stemmer/src/Porter.php b/externals/porter-stemmer/src/Porter.php
new file mode 100644
--- /dev/null
+++ b/externals/porter-stemmer/src/Porter.php
@@ -0,0 +1,426 @@
+<?php
+
+# vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4:
+
+/**
+ * Copyright (c) 2005-2016 Richard Heyes (http://www.phpguru.org/)
+ *
+ * Portions Copyright 2003-2007 Jon Abernathy <jon@chuggnutt.com>
+ *
+ * Originally available under the GPL 2 or greater. Relicensed with permission
+ * of original authors under the MIT License in 2016.
+ *
+ * All rights reserved.
+ *
+ * @package PorterStemmer
+ * @author Richard Heyes
+ * @author Jon Abernathy <jon@chuggnutt.com>
+ * @copyright 2005-2016 Richard Heyes (http://www.phpguru.org/)
+ * @license http://www.opensource.org/licenses/mit-license.html MIT License
+ */
+
+/**
+ * PHP 5 Implementation of the Porter Stemmer algorithm. Certain elements
+ * were borrowed from the (broken) implementation by Jon Abernathy.
+ *
+ * See http://tartarus.org/~martin/PorterStemmer/ for a description of the
+ * algorithm.
+ *
+ * Usage:
+ *
+ * $stem = Porter::Stem($word);
+ *
+ * How easy is that?
+ *
+ * @package PorterStemmer
+ * @author Richard Heyes
+ * @author Jon Abernathy <jon@chuggnutt.com>
+ * @copyright 2005-2016 Richard Heyes (http://www.phpguru.org/)
+ * @license http://www.opensource.org/licenses/mit-license.html MIT License
+ */
+class Porter
+{
+ /**
+ * Regex fragment (no delimiters) matching a single consonant: any of the
+ * letters bcdfghjklmnpqrstvwxz, a "y" directly preceded by one of aeiou,
+ * or a "y" at the very start of the string. Only lowercase letters are
+ * listed.
+ *
+ * @var string
+ */
+ private static $regex_consonant = '(?:[bcdfghjklmnpqrstvwxz]|(?<=[aeiou])y|^y)';
+
+ /**
+ * Regex fragment (no delimiters) matching a single vowel: one of aeiou, or
+ * a "y" that is not directly preceded by one of aeiou.
+ *
+ * @var string
+ */
+ private static $regex_vowel = '(?:[aeiou]|(?<![aeiou])y)';
+
+    /**
+     * Reduce a word to its Porter stem.
+     *
+     * Inputs of one or two bytes are returned unchanged; anything longer is
+     * passed through steps 1ab, 1c, 2, 3, 4 and 5, in that order.
+     *
+     * @param string $word Word to stem
+     *
+     * @return string Stemmed word
+     */
+    public static function Stem($word)
+    {
+        if (strlen($word) < 3) {
+            return $word;
+        }
+
+        $stem = self::step1c(self::step1ab($word));
+        $stem = self::step3(self::step2($stem));
+
+        return self::step5(self::step4($stem));
+    }
+
+ /**
+ * Step 1, parts a and b: strips a trailing "s" style suffix, then the
+ * "eed" / "ing" / "ed" suffixes, tidying up the ending that remains.
+ *
+ * The rule chains below rely on replace() taking $word by reference and
+ * returning true as soon as the suffix matches, so the short-circuiting
+ * OR / AND operators stop at the first suffix that is present.
+ *
+ * @param string $word Word to stem
+ *
+ * @return string The word after step 1a and 1b
+ */
+ private static function step1ab($word)
+ {
+ // Part a
+ if (substr($word, -1) == 's') {
+
+ // First matching suffix wins. The 'ss' => 'ss' rule changes nothing
+ // but matches, which keeps the final 's' rule from firing on "ss".
+ self::replace($word, 'sses', 'ss')
+ OR self::replace($word, 'ies', 'i')
+ OR self::replace($word, 'ss', 'ss')
+ OR self::replace($word, 's', '');
+ }
+
+ // Part b
+ // replace() returns true whenever the word ends in "eed", replaced or
+ // not, so such words never reach the "ing" / "ed" rules below.
+ if (substr($word, -2, 1) != 'e' OR !self::replace($word, 'eed', 'ee', 0)) { // First rule
+ $v = self::$regex_vowel;
+
+ // ing and ed
+ // Each suffix is only removed if what would remain in front of it
+ // contains at least one vowel.
+ if ( preg_match("#$v+#", substr($word, 0, -3)) && self::replace($word, 'ing', '')
+ OR preg_match("#$v+#", substr($word, 0, -2)) && self::replace($word, 'ed', '')) { // Note use of && and OR, for precedence reasons
+
+ // If one of above two test successful
+ if ( !self::replace($word, 'at', 'ate')
+ AND !self::replace($word, 'bl', 'ble')
+ AND !self::replace($word, 'iz', 'ize')) {
+
+ // Double consonant ending
+ // Drop one letter of a doubled final consonant, except for
+ // "ll", "ss" and "zz".
+ if ( self::doubleConsonant($word)
+ AND substr($word, -2) != 'll'
+ AND substr($word, -2) != 'ss'
+ AND substr($word, -2) != 'zz') {
+
+ $word = substr($word, 0, -1);
+
+ } elseif (self::m($word) == 1 AND self::cvc($word)) {
+ // Measure of exactly 1 with a cvc() ending: restore an "e".
+ $word .= 'e';
+ }
+ }
+ }
+ }
+
+ return $word;
+ }
+
+    /**
+     * Step 1c: turns a final "y" into "i" when the part of the word in
+     * front of that "y" contains at least one vowel.
+     *
+     * @param string $word Word to stem
+     *
+     * @return string The word after step 1c
+     */
+    private static function step1c($word)
+    {
+        if (substr($word, -1) != 'y') {
+            return $word;
+        }
+
+        $stem = substr($word, 0, -1);
+        if (!preg_match('#' . self::$regex_vowel . '+#', $stem)) {
+            return $word;
+        }
+
+        return $stem . 'i';
+    }
+
+    /**
+     * Step 2: maps a longer suffix onto a shorter one.
+     *
+     * Rules are grouped by the second-to-last letter of the word. Within a
+     * group the rules are tried in order and the first suffix that is
+     * present ends the search, whether or not replace() actually rewrote
+     * the word (it only rewrites when the remaining stem has m() > 0).
+     *
+     * @param string $word Word to stem
+     *
+     * @return string The word after step 2
+     */
+    private static function step2($word)
+    {
+        $rules = array(
+            'a' => array(
+                'ational' => 'ate',
+                'tional'  => 'tion',
+            ),
+            'c' => array(
+                'enci' => 'ence',
+                'anci' => 'ance',
+            ),
+            'e' => array(
+                'izer' => 'ize',
+            ),
+            'g' => array(
+                'logi' => 'log',
+            ),
+            'l' => array(
+                'entli' => 'ent',
+                'ousli' => 'ous',
+                'alli'  => 'al',
+                'bli'   => 'ble',
+                'eli'   => 'e',
+            ),
+            'o' => array(
+                'ization' => 'ize',
+                'ation'   => 'ate',
+                'ator'    => 'ate',
+            ),
+            's' => array(
+                'iveness' => 'ive',
+                'fulness' => 'ful',
+                'ousness' => 'ous',
+                'alism'   => 'al',
+            ),
+            't' => array(
+                'biliti' => 'ble',
+                'aliti'  => 'al',
+                'iviti'  => 'ive',
+            ),
+        );
+
+        $penultimate = substr($word, -2, 1);
+
+        if (isset($rules[$penultimate])) {
+            foreach ($rules[$penultimate] as $suffix => $replacement) {
+                if (self::replace($word, $suffix, $replacement, 0)) {
+                    break;
+                }
+            }
+        }
+
+        return $word;
+    }
+
+    /**
+     * Step 3: shortens or removes a further set of suffixes.
+     *
+     * Rules are grouped by the second-to-last letter of the word; within a
+     * group the first suffix that is present ends the search. replace()
+     * only rewrites the word when the remaining stem has m() > 0.
+     *
+     * @param string $word String to stem
+     *
+     * @return string The word after step 3
+     */
+    private static function step3($word)
+    {
+        $rules = array(
+            'a' => array(
+                'ical' => 'ic',
+            ),
+            's' => array(
+                'ness' => '',
+            ),
+            't' => array(
+                'icate' => 'ic',
+                'iciti' => 'ic',
+            ),
+            'u' => array(
+                'ful' => '',
+            ),
+            'v' => array(
+                'ative' => '',
+            ),
+            'z' => array(
+                'alize' => 'al',
+            ),
+        );
+
+        $penultimate = substr($word, -2, 1);
+
+        if (isset($rules[$penultimate])) {
+            foreach ($rules[$penultimate] as $suffix => $replacement) {
+                if (self::replace($word, $suffix, $replacement, 0)) {
+                    break;
+                }
+            }
+        }
+
+        return $word;
+    }
+
+    /**
+     * Step 4: removes a suffix outright when the remaining stem has
+     * m() > 1.
+     *
+     * Suffixes are grouped by the second-to-last letter of the word; within
+     * a group the first suffix that is present ends the search. The letter
+     * "o" is handled separately: words ending in "tion" or "sion" try the
+     * suffix "ion", every other word tries "ou".
+     *
+     * @param string $word Word to stem
+     *
+     * @return string The word after step 4
+     */
+    private static function step4($word)
+    {
+        $suffixes = array(
+            'a' => array('al'),
+            'c' => array('ance', 'ence'),
+            'e' => array('er'),
+            'i' => array('ic'),
+            'l' => array('able', 'ible'),
+            'n' => array('ant', 'ement', 'ment', 'ent'),
+            's' => array('ism'),
+            't' => array('ate', 'iti'),
+            'u' => array('ous'),
+            'v' => array('ive'),
+            'z' => array('ize'),
+        );
+
+        $penultimate = substr($word, -2, 1);
+
+        if ($penultimate == 'o') {
+            $ending = substr($word, -4);
+            $suffix = ($ending == 'tion' || $ending == 'sion') ? 'ion' : 'ou';
+            self::replace($word, $suffix, '', 1);
+        } elseif (isset($suffixes[$penultimate])) {
+            foreach ($suffixes[$penultimate] as $suffix) {
+                if (self::replace($word, $suffix, '', 1)) {
+                    break;
+                }
+            }
+        }
+
+        return $word;
+    }
+
+    /**
+     * Step 5: tidies up the end of the word.
+     *
+     * Part a drops a final "e" when the rest of the word has m() > 1, or
+     * has m() == 1 and does not end in a cvc() sequence. Part b drops one
+     * "l" from a final "ll" when the word has m() > 1.
+     *
+     * @param string $word Word to stem
+     *
+     * @return string The word after step 5
+     */
+    private static function step5($word)
+    {
+        // Part a
+        if (substr($word, -1) == 'e') {
+            $stem    = substr($word, 0, -1);
+            $measure = self::m($stem);
+
+            if ($measure > 1 || ($measure == 1 && !self::cvc($stem))) {
+                $word = $stem;
+            }
+        }
+
+        // Part b
+        if (substr($word, -1) == 'l'
+            && self::doubleConsonant($word)
+            && self::m($word) > 1
+        ) {
+            $word = substr($word, 0, -1);
+        }
+
+        return $word;
+    }
+
+    /**
+     * Swaps one ending for another at the end of a string.
+     *
+     * When the fourth argument is given, the swap only happens if the part
+     * of the string in front of the ending has an m() count strictly
+     * greater than it.
+     *
+     * @param string   $str   String to check; rewritten in place on success
+     * @param string   $check Ending to check for
+     * @param string   $repl  Replacement string
+     * @param int|null $m     Optional m() count the stem must exceed
+     *
+     * @return bool Whether the $check string was at the end of the $str
+     *              string. True does not necessarily mean that it was
+     *              replaced.
+     */
+    private static function replace(&$str, $check, $repl, $m = null)
+    {
+        $offset = -strlen($check);
+
+        if (substr($str, $offset) != $check) {
+            return false;
+        }
+
+        $stem = substr($str, 0, $offset);
+        if ($m === null || self::m($stem) > $m) {
+            $str = $stem . $repl;
+        }
+
+        return true;
+    }
+
+    /**
+     * Computes the "measure" of a string: the number of vowel-run /
+     * consonant-run pairs left once any leading consonants and any
+     * trailing vowels have been stripped.
+     *
+     * With c a consonant sequence, v a vowel sequence and <..> marking an
+     * optional part:
+     *
+     * <c><v>       gives 0
+     * <c>vc<v>     gives 1
+     * <c>vcvc<v>   gives 2
+     * <c>vcvcvc<v> gives 3
+     *
+     * @param string $str The string to return the m count for
+     *
+     * @return int The m count
+     */
+    private static function m($str)
+    {
+        $consonant = self::$regex_consonant;
+        $vowel     = self::$regex_vowel;
+
+        // The two patterns are applied one after the other: leading
+        // consonants first, then trailing vowels.
+        $core = preg_replace(
+            array('#^' . $consonant . '+#', '#' . $vowel . '+$#'),
+            '',
+            $str
+        );
+
+        preg_match_all('#(' . $vowel . '+' . $consonant . '+)#', $core, $pairs);
+
+        return count($pairs[1]);
+    }
+
+    /**
+     * Returns true/false as to whether the given string ends in two
+     * consonants that are the same letter.
+     *
+     * The two matched characters are read with square-bracket offsets; the
+     * curly-brace form ($s{0}) is deprecated in PHP 7.4 and is a parse
+     * error from PHP 8.0.
+     *
+     * @param string $str String to check
+     *
+     * @return bool Result
+     */
+    private static function doubleConsonant($str)
+    {
+        $c = self::$regex_consonant;
+
+        if (!preg_match('#' . $c . '{2}$#', $str, $matches)) {
+            return false;
+        }
+
+        // $matches[0] is the two-character consonant pair at the end.
+        return $matches[0][0] == $matches[0][1];
+    }
+
+    /**
+     * Checks for an ending consonant-vowel-consonant sequence where the
+     * second consonant is not w, x or y.
+     *
+     * The last matched character is read with a square-bracket offset; the
+     * curly-brace form ($s{2}) is deprecated in PHP 7.4 and is a parse
+     * error from PHP 8.0.
+     *
+     * @param string $str String to check
+     *
+     * @return bool Result
+     */
+    private static function cvc($str)
+    {
+        $c = self::$regex_consonant;
+        $v = self::$regex_vowel;
+
+        if (!preg_match('#(' . $c . $v . $c . ')$#', $str, $matches)) {
+            return false;
+        }
+
+        if (strlen($matches[1]) != 3) {
+            return false;
+        }
+
+        $last = $matches[1][2];
+
+        return $last != 'w' && $last != 'x' && $last != 'y';
+    }
+}
diff --git a/src/__phutil_library_map__.php b/src/__phutil_library_map__.php
--- a/src/__phutil_library_map__.php
+++ b/src/__phutil_library_map__.php
@@ -5658,6 +5658,12 @@
'PhutilRemarkupTableBlockRule' => 'infrastructure/markup/blockrule/PhutilRemarkupTableBlockRule.php',
'PhutilRemarkupTestInterpreterRule' => 'infrastructure/markup/blockrule/PhutilRemarkupTestInterpreterRule.php',
'PhutilRemarkupUnderlineRule' => 'infrastructure/markup/markuprule/PhutilRemarkupUnderlineRule.php',
+ 'PhutilSearchQueryCompiler' => 'applications/search/compiler/PhutilSearchQueryCompiler.php',
+ 'PhutilSearchQueryCompilerSyntaxException' => 'applications/search/compiler/PhutilSearchQueryCompilerSyntaxException.php',
+ 'PhutilSearchQueryCompilerTestCase' => 'applications/search/compiler/__tests__/PhutilSearchQueryCompilerTestCase.php',
+ 'PhutilSearchQueryToken' => 'applications/search/compiler/PhutilSearchQueryToken.php',
+ 'PhutilSearchStemmer' => 'applications/search/compiler/PhutilSearchStemmer.php',
+ 'PhutilSearchStemmerTestCase' => 'applications/search/compiler/__tests__/PhutilSearchStemmerTestCase.php',
'PhutilSlackAuthAdapter' => 'applications/auth/adapter/PhutilSlackAuthAdapter.php',
'PhutilTwitchAuthAdapter' => 'applications/auth/adapter/PhutilTwitchAuthAdapter.php',
'PhutilTwitterAuthAdapter' => 'applications/auth/adapter/PhutilTwitterAuthAdapter.php',
@@ -12483,6 +12489,12 @@
'PhutilRemarkupTableBlockRule' => 'PhutilRemarkupBlockRule',
'PhutilRemarkupTestInterpreterRule' => 'PhutilRemarkupBlockInterpreter',
'PhutilRemarkupUnderlineRule' => 'PhutilRemarkupRule',
+ 'PhutilSearchQueryCompiler' => 'Phobject',
+ 'PhutilSearchQueryCompilerSyntaxException' => 'Exception',
+ 'PhutilSearchQueryCompilerTestCase' => 'PhutilTestCase',
+ 'PhutilSearchQueryToken' => 'Phobject',
+ 'PhutilSearchStemmer' => 'Phobject',
+ 'PhutilSearchStemmerTestCase' => 'PhutilTestCase',
'PhutilSlackAuthAdapter' => 'PhutilOAuthAuthAdapter',
'PhutilTwitchAuthAdapter' => 'PhutilOAuthAuthAdapter',
'PhutilTwitterAuthAdapter' => 'PhutilOAuth1AuthAdapter',
diff --git a/src/applications/search/compiler/PhutilSearchQueryCompiler.php b/src/applications/search/compiler/PhutilSearchQueryCompiler.php
new file mode 100644
--- /dev/null
+++ b/src/applications/search/compiler/PhutilSearchQueryCompiler.php
@@ -0,0 +1,374 @@
+<?php
+
+final class PhutilSearchQueryCompiler
+ extends Phobject {
+
+ // Operator characters, read by position when rendering tokens: index 0
+ // is the prefix for "and" terms, index 2 the prefix for "not" terms, and
+ // indexes 10 and 11 are the opening and closing quote characters. The
+ // other positions are not read by this class.
+ // NOTE(review): the default looks like MySQL's ft_boolean_syntax value;
+ // confirm against the callers that pass their own string.
+ private $operators = '+ -><()~*:""&|';
+ // Not referenced by any method visible in this class.
+ private $query;
+ // Optional PhutilSearchStemmer used by compileStemmedQuery().
+ private $stemmer;
+ // When true, the tokenizer recognizes "name:" function prefixes and the
+ // additional "~" and "=" operators.
+ private $enableFunctions = false;
+
+ // Operator identifiers stored on tokens by the tokenizer.
+ const OPERATOR_NOT = 'not';
+ const OPERATOR_AND = 'and';
+ const OPERATOR_SUBSTRING = 'sub';
+ const OPERATOR_EXACT = 'exact';
+
+ /**
+ * Set the operator character string used when rendering tokens.
+ *
+ * The string is read by position: index 0 is the "and" prefix, index 2
+ * the "not" prefix, and indexes 10 and 11 the quote characters.
+ *
+ * @param string $operators Operator characters.
+ * @return $this
+ */
+ public function setOperators($operators) {
+ $this->operators = $operators;
+ return $this;
+ }
+
+ /**
+ * Get the operator character string used when rendering tokens.
+ *
+ * @return string Operator characters.
+ */
+ public function getOperators() {
+ return $this->operators;
+ }
+
+ /**
+ * Set the stemmer applied to unquoted tokens by compileStemmedQuery().
+ *
+ * @param PhutilSearchStemmer $stemmer Stemmer to use.
+ * @return $this
+ */
+ public function setStemmer(PhutilSearchStemmer $stemmer) {
+ $this->stemmer = $stemmer;
+ return $this;
+ }
+
+ /**
+ * Get the configured stemmer.
+ *
+ * @return PhutilSearchStemmer|null Stemmer, or null if none has been set.
+ */
+ public function getStemmer() {
+ return $this->stemmer;
+ }
+
+ /**
+ * Enable or disable function syntax in the tokenizer.
+ *
+ * When enabled, the tokenizer recognizes a leading "name:" prefix on a
+ * term, accepts the "~" and "=" operators, and adds a "function" key to
+ * each token dictionary.
+ *
+ * @param bool $enable_functions True to enable function syntax.
+ * @return $this
+ */
+ public function setEnableFunctions($enable_functions) {
+ $this->enableFunctions = $enable_functions;
+ return $this;
+ }
+
+ /**
+ * Check whether function syntax is enabled in the tokenizer.
+ *
+ * @return bool True if function syntax is enabled.
+ */
+ public function getEnableFunctions() {
+ return $this->enableFunctions;
+ }
+
+  /**
+   * Render every token, without stemming, and join the results into one
+   * compiled query string.
+   *
+   * @param list<PhutilSearchQueryToken> $tokens Tokens to compile.
+   * @return string|null Compiled query, or null if nothing was rendered.
+   */
+  public function compileQuery(array $tokens) {
+    assert_instances_of($tokens, 'PhutilSearchQueryToken');
+
+    $rendered = array();
+    foreach ($tokens as $each) {
+      $rendered[] = $this->renderToken($each);
+    }
+
+    return $this->compileRenderedTokens($rendered);
+  }
+
+  /**
+   * Compile only the quoted tokens, rendering each one without stemming.
+   *
+   * @param list<PhutilSearchQueryToken> $tokens Tokens to compile.
+   * @return string|null Compiled query, or null if no token is quoted.
+   */
+  public function compileLiteralQuery(array $tokens) {
+    assert_instances_of($tokens, 'PhutilSearchQueryToken');
+
+    $rendered = array();
+    foreach ($tokens as $each) {
+      if ($each->isQuoted()) {
+        $rendered[] = $this->renderToken($each);
+      }
+    }
+
+    return $this->compileRenderedTokens($rendered);
+  }
+
+  /**
+   * Compile only the unquoted tokens, passing each one through the
+   * configured stemmer (if any) before it is rendered.
+   *
+   * @param list<PhutilSearchQueryToken> $tokens Tokens to compile.
+   * @return string|null Compiled query, or null if every token is quoted.
+   */
+  public function compileStemmedQuery(array $tokens) {
+    assert_instances_of($tokens, 'PhutilSearchQueryToken');
+
+    $stemmer = $this->getStemmer();
+
+    $rendered = array();
+    foreach ($tokens as $each) {
+      if (!$each->isQuoted()) {
+        $rendered[] = $this->renderToken($each, $stemmer);
+      }
+    }
+
+    return $this->compileRenderedTokens($rendered);
+  }
+
+  /**
+   * Join rendered tokens into one space-separated string, dropping
+   * duplicate entries.
+   *
+   * @param list<string> $list Rendered tokens.
+   * @return string|null Joined query, or null if the list is empty.
+   */
+  private function compileRenderedTokens(array $list) {
+    return $list
+      ? implode(' ', array_unique($list))
+      : null;
+  }
+
+  /**
+   * Tokenize a raw query string into token objects.
+   *
+   * @param string $query Raw query text.
+   * @return list<PhutilSearchQueryToken> One token per search term.
+   */
+  public function newTokens($query) {
+    $tokens = array();
+
+    foreach ($this->tokenizeQuery($query) as $dictionary) {
+      $tokens[] = PhutilSearchQueryToken::newFromDictionary($dictionary);
+    }
+
+    return $tokens;
+  }
+
+ /**
+ * Split a raw query string into a list of term dictionaries.
+ *
+ * Each dictionary has the keys "operator" (one of the OPERATOR_*
+ * constants), "quoted" (bool) and "value" (string), plus "function"
+ * (string or null) when function syntax is enabled. Terms with an empty
+ * value are dropped.
+ *
+ * The scanner is a small state machine. For each term it moves through
+ * the modes "scan", "function", "operator", "quote" and "token". The mode
+ * checks are consecutive "if" blocks, not "elseif", so one character can
+ * fall through several modes in a single iteration.
+ *
+ * @param string $query Raw query text.
+ * @return list<array> Term dictionaries, in query order.
+ * @throws PhutilSearchQueryCompilerSyntaxException If the query is longer
+ * than 1024 bytes, has an unmatched double quote, ends while an operator
+ * is still expected, or uses an invalid operator sequence.
+ */
+ private function tokenizeQuery($query) {
+ $maximum_bytes = 1024;
+
+ $query_bytes = strlen($query);
+ if ($query_bytes > $maximum_bytes) {
+ throw new PhutilSearchQueryCompilerSyntaxException(
+ pht(
+ 'Query is too long (%s bytes, maximum is %s bytes).',
+ new PhutilNumber($query_bytes),
+ new PhutilNumber($maximum_bytes)));
+ }
+
+ // NOTE(review): presumably splits the string into a list of UTF-8
+ // characters; the result is counted and indexed as an array below.
+ // Confirm against phutil_utf8v().
+ $query = phutil_utf8v($query);
+ $length = count($query);
+
+ $enable_functions = $this->getEnableFunctions();
+
+ $mode = 'scan';
+ $current_operator = array();
+ $current_token = array();
+ $current_function = null;
+ $is_quoted = false;
+ $tokens = array();
+
+ // Regex character class of the characters accepted as operators.
+ if ($enable_functions) {
+ $operator_characters = '[~=+-]';
+ } else {
+ $operator_characters = '[+-]';
+ }
+
+ for ($ii = 0; $ii < $length; $ii++) {
+ $character = $query[$ii];
+
+ // "scan": skip whitespace between terms.
+ if ($mode == 'scan') {
+ if (preg_match('/^\s\z/u', $character)) {
+ continue;
+ }
+
+ $mode = 'function';
+ }
+
+ // "function": with function syntax enabled, look ahead for a run of
+ // ASCII letters (possibly empty) immediately followed by ":".
+ if ($mode == 'function') {
+ $mode = 'operator';
+
+ if ($enable_functions) {
+ $found = false;
+ for ($jj = $ii; $jj < $length; $jj++) {
+ if (preg_match('/^[a-zA-Z]\z/u', $query[$jj])) {
+ continue;
+ }
+ if ($query[$jj] == ':') {
+ $found = $jj;
+ }
+ break;
+ }
+
+ // $found can legitimately be 0 (a query starting with ":"), hence
+ // the strict comparison against false.
+ if ($found !== false) {
+ $function = array_slice($query, $ii, ($jj - $ii));
+ $current_function = implode('', $function);
+
+ if (!strlen($current_function)) {
+ $current_function = null;
+ }
+
+ // Resume scanning at the character after the colon.
+ $ii = $jj;
+ continue;
+ }
+ }
+ }
+
+ // "operator": collect operator characters, skipping any whitespace
+ // between the operator and its term.
+ if ($mode == 'operator') {
+ if (preg_match('/^\s\z/u', $character)) {
+ continue;
+ }
+
+ if (preg_match('/^'.$operator_characters.'\z/', $character)) {
+ $current_operator[] = $character;
+ continue;
+ }
+
+ $mode = 'quote';
+ }
+
+ // "quote": an opening double quote starts a quoted term.
+ if ($mode == 'quote') {
+ if (preg_match('/^"\z/', $character)) {
+ $is_quoted = true;
+ $mode = 'token';
+ continue;
+ }
+
+ $mode = 'token';
+ }
+
+ // "token": accumulate characters until the term ends. A quoted term
+ // ends at the closing quote; an unquoted term ends at whitespace or
+ // at a double quote, which also opens a new quoted term.
+ if ($mode == 'token') {
+ $capture = false;
+ $was_quoted = $is_quoted;
+ if ($is_quoted) {
+ if (preg_match('/^"\z/', $character)) {
+ $capture = true;
+ $mode = 'scan';
+ $is_quoted = false;
+ }
+ } else {
+ if (preg_match('/^\s\z/u', $character)) {
+ $capture = true;
+ $mode = 'scan';
+ }
+
+ if (preg_match('/^"\z/', $character)) {
+ $capture = true;
+ $mode = 'token';
+ $is_quoted = true;
+ }
+ }
+
+ if ($capture) {
+ $token = array(
+ 'operator' => $current_operator,
+ 'quoted' => $was_quoted,
+ 'value' => $current_token,
+ );
+
+ if ($enable_functions) {
+ $token['function'] = $current_function;
+ }
+
+ $tokens[] = $token;
+
+ $current_operator = array();
+ $current_token = array();
+ $current_function = null;
+ continue;
+ } else {
+ $current_token[] = $character;
+ }
+ }
+ }
+
+ if ($is_quoted) {
+ throw new PhutilSearchQueryCompilerSyntaxException(
+ pht(
+ 'Query contains unmatched double quotes.'));
+ }
+
+ // The input ended while the scanner was still in "operator" mode, so no
+ // term followed.
+ if ($mode == 'operator') {
+ throw new PhutilSearchQueryCompilerSyntaxException(
+ pht(
+ 'Query contains operator ("%s") with no search term.',
+ implode('', $current_operator)));
+ }
+
+ // Flush whatever was being accumulated when the input ended. If it is
+ // empty, it is discarded by the value check below.
+ $token = array(
+ 'operator' => $current_operator,
+ 'quoted' => false,
+ 'value' => $current_token,
+ );
+
+ if ($enable_functions) {
+ $token['function'] = $current_function;
+ }
+
+ $tokens[] = $token;
+
+ // Second pass: join the character lists into strings and map each
+ // operator string onto an OPERATOR_* constant.
+ $results = array();
+ foreach ($tokens as $token) {
+ $value = implode('', $token['value']);
+ $operator_string = implode('', $token['operator']);
+
+ if (!strlen($value)) {
+ continue;
+ }
+
+ $is_quoted = $token['quoted'];
+
+ switch ($operator_string) {
+ case '-':
+ $operator = self::OPERATOR_NOT;
+ break;
+ case '~':
+ $operator = self::OPERATOR_SUBSTRING;
+ break;
+ case '=':
+ $operator = self::OPERATOR_EXACT;
+ break;
+ case '+':
+ $operator = self::OPERATOR_AND;
+ break;
+ case '':
+ // See T12995. If this query term contains Chinese, Japanese or
+ // Korean characters, treat the term as a substring term by default.
+ // These languages do not separate words with spaces, so the term
+ // search mode is normally useless.
+ if ($enable_functions && !$is_quoted && phutil_utf8_is_cjk($value)) {
+ $operator = self::OPERATOR_SUBSTRING;
+ } else {
+ $operator = self::OPERATOR_AND;
+ }
+ break;
+ default:
+ // Any other operator string, such as two operators in a row.
+ throw new PhutilSearchQueryCompilerSyntaxException(
+ pht(
+ 'Query has an invalid sequence of operators ("%s").',
+ $operator_string));
+ }
+
+ $result = array(
+ 'operator' => $operator,
+ 'quoted' => $is_quoted,
+ 'value' => $value,
+ );
+
+ if ($enable_functions) {
+ $result['function'] = $token['function'];
+ }
+
+ $results[] = $result;
+ }
+
+ return $results;
+ }
+
+  /**
+   * Render one token as an operator prefix followed by the quoted term.
+   *
+   * @param PhutilSearchQueryToken $token Token to render.
+   * @param PhutilSearchStemmer|null $stemmer If given, the term is stemmed
+   *   before it is quoted.
+   * @return string Rendered token.
+   */
+  private function renderToken(
+    PhutilSearchQueryToken $token,
+    PhutilSearchStemmer $stemmer = null) {
+    $term = $token->getValue();
+
+    if ($stemmer) {
+      $term = $stemmer->stemToken($term);
+    }
+
+    // Quote before resolving the prefix, so any error raised for an
+    // unsupported operator surfaces at the same point as before.
+    $quoted = $this->quoteToken($term);
+    $prefix = $this->getOperatorPrefix($token->getOperator());
+
+    return $prefix.$quoted;
+  }
+
+  /**
+   * Look up the prefix character for an operator in the operator string.
+   *
+   * Only OPERATOR_AND (index 0) and OPERATOR_NOT (index 2) can be
+   * rendered. A prefix that is a single space is returned as null, so
+   * nothing is prepended to the term.
+   *
+   * @param string $operator One of the OPERATOR_* constants.
+   * @return string|null Prefix character, or null for no prefix.
+   * @throws PhutilSearchQueryCompilerSyntaxException For any other operator.
+   */
+  private function getOperatorPrefix($operator) {
+    if ($operator == self::OPERATOR_AND) {
+      $index = 0;
+    } else if ($operator == self::OPERATOR_NOT) {
+      $index = 2;
+    } else {
+      throw new PhutilSearchQueryCompilerSyntaxException(
+        pht(
+          'Unsupported operator prefix "%s".',
+          $operator));
+    }
+
+    $prefix = $this->operators[$index];
+
+    return ($prefix == ' ') ? null : $prefix;
+  }
+
+  /**
+   * Wrap a term in the quote characters taken from positions 10 and 11 of
+   * the operator string. The term itself is not escaped.
+   *
+   * @param string $value Term to quote.
+   * @return string Quoted term.
+   */
+  private function quoteToken($value) {
+    $operators = $this->operators;
+
+    return $operators[10].$value.$operators[11];
+  }
+
+}
diff --git a/src/applications/search/compiler/PhutilSearchQueryCompilerSyntaxException.php b/src/applications/search/compiler/PhutilSearchQueryCompilerSyntaxException.php
new file mode 100644
--- /dev/null
+++ b/src/applications/search/compiler/PhutilSearchQueryCompilerSyntaxException.php
@@ -0,0 +1,4 @@
+<?php
+
+/**
+ * Thrown by @{class:PhutilSearchQueryCompiler} when a query cannot be
+ * tokenized (too long, unmatched quotes, stray or invalid operators) or
+ * when a token uses an operator that cannot be rendered.
+ */
+final class PhutilSearchQueryCompilerSyntaxException
+ extends Exception {}
diff --git a/src/applications/search/compiler/PhutilSearchQueryToken.php b/src/applications/search/compiler/PhutilSearchQueryToken.php
new file mode 100644
--- /dev/null
+++ b/src/applications/search/compiler/PhutilSearchQueryToken.php
@@ -0,0 +1,37 @@
+<?php
+
+/**
+ * Value object for one search term: its text, whether it was quoted, its
+ * operator, and its optional function name.
+ */
+final class PhutilSearchQueryToken extends Phobject {
+
+ private $isQuoted;
+ private $value;
+ private $operator;
+ private $function;
+
+ /**
+ * Build a token from a dictionary.
+ *
+ * The keys "quoted", "operator" and "value" are read directly and must be
+ * present; "function" is read with idx() and may be absent.
+ *
+ * @param array $dictionary Term dictionary.
+ * @return PhutilSearchQueryToken New token.
+ */
+ public static function newFromDictionary(array $dictionary) {
+ $token = new self();
+
+ $token->isQuoted = $dictionary['quoted'];
+ $token->operator = $dictionary['operator'];
+ $token->value = $dictionary['value'];
+ $token->function = idx($dictionary, 'function');
+
+ return $token;
+ }
+
+ /**
+ * @return bool The "quoted" value this token was built with.
+ */
+ public function isQuoted() {
+ return $this->isQuoted;
+ }
+
+ /**
+ * @return string The "value" this token was built with.
+ */
+ public function getValue() {
+ return $this->value;
+ }
+
+ /**
+ * @return string The "operator" this token was built with.
+ */
+ public function getOperator() {
+ return $this->operator;
+ }
+
+ /**
+ * @return string|null The "function" this token was built with, if any.
+ */
+ public function getFunction() {
+ return $this->function;
+ }
+
+}
diff --git a/src/applications/search/compiler/PhutilSearchStemmer.php b/src/applications/search/compiler/PhutilSearchStemmer.php
new file mode 100644
--- /dev/null
+++ b/src/applications/search/compiler/PhutilSearchStemmer.php
@@ -0,0 +1,74 @@
+<?php
+
+final class PhutilSearchStemmer
+ extends Phobject {
+
+  /**
+   * Normalize a single token, then stem it.
+   *
+   * @param string $token Token to stem.
+   * @return string Stemmed token.
+   */
+  public function stemToken($token) {
+    return $this->applyStemmer($this->normalizeToken($token));
+  }
+
+  /**
+   * Reduce a block of text to a space-separated list of distinct stems.
+   *
+   * The text is normalized, then split on runs of bytes other than ASCII
+   * letters, digits, ".", "_" and bytes 0x7F-0xFF. Each piece has leading
+   * and trailing "." and "_" trimmed; pieces shorter than three bytes and
+   * repeated words are skipped. The remaining words are stemmed in order
+   * of first appearance.
+   *
+   * @param string $corpus Text to stem.
+   * @return string Space-separated stems.
+   */
+  public function stemCorpus($corpus) {
+    $pieces = preg_split(
+      '/[^a-zA-Z0-9\x7F-\xFF._]+/',
+      $this->normalizeCorpus($corpus));
+
+    $seen = array();
+    $stems = array();
+    foreach ($pieces as $piece) {
+      $word = trim($piece, '._');
+
+      if (strlen($word) < 3 || isset($seen[$word])) {
+        continue;
+      }
+
+      $seen[$word] = true;
+      $stems[] = $this->applyStemmer($word);
+    }
+
+    return implode(' ', $stems);
+  }
+
+ /**
+ * Normalize a single token before stemming by passing it through
+ * phutil_utf8_strtolower().
+ *
+ * @param string $token Raw token.
+ * @return string Normalized token.
+ */
+ private function normalizeToken($token) {
+ return phutil_utf8_strtolower($token);
+ }
+
+ /**
+ * Normalize a whole corpus before it is split into words by passing it
+ * through phutil_utf8_strtolower().
+ *
+ * @param string $corpus Raw text.
+ * @return string Normalized text.
+ */
+ private function normalizeCorpus($corpus) {
+ return phutil_utf8_strtolower($corpus);
+ }
+
+  /**
+   * Stem one already-normalized token with the bundled Porter stemmer.
+   *
+   * Tokens containing "." or "_" are returned as-is, and so are tokens
+   * whose stem would be shorter than three bytes.
+   *
+   * @param string $normalized_token Normalized token.
+   * @return string Stem, or the token itself when it is left unstemmed.
+   *
+   * @phutil-external-symbol class Porter
+   */
+  private function applyStemmer($normalized_token) {
+    // Tokens with "." or "_" in them are things like domain names and
+    // Conduit API methods rather than words, so they are kept literally.
+    if (preg_match('/[._]/', $normalized_token)) {
+      return $normalized_token;
+    }
+
+    // Pull in the external stemmer the first time a token reaches it.
+    static $loaded;
+
+    if ($loaded === null) {
+      $library_root = phutil_get_library_root('phabricator');
+      $path = dirname($library_root).'/externals/porter-stemmer/src/Porter.php';
+      require_once $path;
+      $loaded = true;
+    }
+
+    $stem = Porter::stem($normalized_token);
+
+    // A stem this short would not be a candidate for indexing, and the
+    // token is more likely an acronym (like "DNS") than an English word.
+    return (strlen($stem) < 3)
+      ? $normalized_token
+      : $stem;
+  }
+
+}
diff --git a/src/applications/search/compiler/__tests__/PhutilSearchQueryCompilerTestCase.php b/src/applications/search/compiler/__tests__/PhutilSearchQueryCompilerTestCase.php
new file mode 100644
--- /dev/null
+++ b/src/applications/search/compiler/__tests__/PhutilSearchQueryCompilerTestCase.php
@@ -0,0 +1,220 @@
+<?php
+
+final class PhutilSearchQueryCompilerTestCase
+ extends PhutilTestCase {
+
+ /**
+ * Compile plain queries without a stemmer, first with the default
+ * operator string and then with two custom operator strings.
+ *
+ * Each table maps a raw query to the expected compiled string. An
+ * expected value of null means nothing was compiled; false means the
+ * compiler must raise a syntax exception.
+ */
+ public function testCompileQueries() {
+ $tests = array(
+ '' => null,
+ 'cat dog' => '+"cat" +"dog"',
+ 'cat -dog' => '+"cat" -"dog"',
+ 'cat-dog' => '+"cat-dog"',
+
+ // If there are spaces after an operator, the operator applies to the
+ // next search term.
+ 'cat - dog' => '+"cat" -"dog"',
+
+ // Double quotes serve as delimiters even if there is no whitespace
+ // between terms.
+ '"cat"dog' => '+"cat" +"dog"',
+
+ // This query is too long.
+ str_repeat('x', 2048) => false,
+
+ // Multiple operators are not permitted.
+ '++cat' => false,
+ '+-cat' => false,
+ '--cat' => false,
+
+ // Stray operators are not permitted.
+ '+' => false,
+ 'cat +' => false,
+
+ // Double quotes must be paired.
+ '"' => false,
+ 'cat "' => false,
+ '"cat' => false,
+ 'A"' => false,
+ 'A"B"' => '+"A" +"B"',
+ );
+
+ $this->assertCompileQueries($tests);
+
+ // Test that we compile queries correctly if the operators have been
+ // swapped to use "AND" by default.
+ // Position 0 of this operator string is a space, so "and" terms are
+ // rendered with no prefix.
+ $operator_tests = array(
+ 'cat dog' => '"cat" "dog"',
+ 'cat -dog' => '"cat" -"dog"',
+ );
+ $this->assertCompileQueries($operator_tests, ' |-><()~*:""&\'');
+
+
+ // Test that we compile queries correctly if the quote operators have
+ // been swapped to differ.
+ // Positions 10 and 11 of this operator string are "[" and "]".
+ $quote_tests = array(
+ 'cat dog' => '+[cat] +[dog]',
+ 'cat -dog' => '+[cat] -[dog]',
+ );
+ $this->assertCompileQueries($quote_tests, '+ -><()~*:[]&|');
+
+ }
+
+ /**
+ * Compile queries with a stemmer attached.
+ *
+ * Each expected value is a pair: the literal query (quoted terms only,
+ * unstemmed) followed by the stemmed query (unquoted terms only). A null
+ * entry means that half of the query compiled to nothing.
+ */
+ public function testCompileQueriesWithStemming() {
+ $stemming_tests = array(
+ 'cat dog' => array(
+ null,
+ '+"cat" +"dog"',
+ ),
+ 'cats dogs' => array(
+ null,
+ '+"cat" +"dog"',
+ ),
+ 'cats "dogs"' => array(
+ '+"dogs"',
+ '+"cat"',
+ ),
+ '"blessed blade" of the windseeker' => array(
+ '+"blessed blade"',
+ '+"of" +"the" +"windseek"',
+ ),
+ 'mailing users for mentions on tasks' => array(
+ null,
+ '+"mail" +"user" +"for" +"mention" +"on" +"task"',
+ ),
+ );
+
+ $stemmer = new PhutilSearchStemmer();
+ $this->assertCompileQueries($stemming_tests, null, $stemmer);
+ }
+
+ /**
+ * Tokenize queries with function syntax enabled.
+ *
+ * Each expected value is a list of (function, operator, value) triples,
+ * one per token, in query order.
+ */
+ public function testCompileQueriesWithFunctions() {
+ $op_and = PhutilSearchQueryCompiler::OPERATOR_AND;
+ $op_sub = PhutilSearchQueryCompiler::OPERATOR_SUBSTRING;
+ $op_exact = PhutilSearchQueryCompiler::OPERATOR_EXACT;
+
+ // UTF-8 encoding of the single CJK character U+732B.
+ $mao = "\xE7\x8C\xAB";
+
+ $function_tests = array(
+ 'cat' => array(
+ array(null, $op_and, 'cat'),
+ ),
+ // An empty function name is reported as null.
+ ':cat' => array(
+ array(null, $op_and, 'cat'),
+ ),
+ 'title:cat' => array(
+ array('title', $op_and, 'cat'),
+ ),
+ // Only the first "name:" prefix is treated as a function.
+ 'title:cat:dog' => array(
+ array('title', $op_and, 'cat:dog'),
+ ),
+ 'title:~cat' => array(
+ array('title', $op_sub, 'cat'),
+ ),
+ 'cat title:="Meow Meow"' => array(
+ array(null, $op_and, 'cat'),
+ array('title', $op_exact, 'Meow Meow'),
+ ),
+ 'title:cat title:dog' => array(
+ array('title', $op_and, 'cat'),
+ array('title', $op_and, 'dog'),
+ ),
+ '~"core and seven years ag"' => array(
+ array(null, $op_sub, 'core and seven years ag'),
+ ),
+ // A bare, unquoted CJK term defaults to substring search; an explicit
+ // operator or quoting overrides that default.
+ $mao => array(
+ array(null, $op_sub, $mao),
+ ),
+ '+'.$mao => array(
+ array(null, $op_and, $mao),
+ ),
+ '~'.$mao => array(
+ array(null, $op_sub, $mao),
+ ),
+ '"'.$mao.'"' => array(
+ array(null, $op_and, $mao),
+ ),
+ );
+
+ $this->assertCompileFunctionQueries($function_tests);
+ }
+
+  /**
+   * Compile each query in a table and compare the result to the expected
+   * value.
+   *
+   * Without a stemmer, the result is the output of compileQuery(). With a
+   * stemmer, it is the pair (literal query, stemmed query). In both cases
+   * a syntax exception is reported as false.
+   *
+   * @param map<string, wild> $tests Map from raw query to expected result.
+   * @param string|null $operators Optional custom operator string.
+   * @param PhutilSearchStemmer|null $stemmer Optional stemmer.
+   * @return void
+   */
+  private function assertCompileQueries(
+    array $tests,
+    $operators = null,
+    PhutilSearchStemmer $stemmer = null) {
+    foreach ($tests as $input => $expect) {
+      try {
+        $compiler = new PhutilSearchQueryCompiler();
+
+        if ($operators !== null) {
+          $compiler->setOperators($operators);
+        }
+
+        if ($stemmer !== null) {
+          $compiler->setStemmer($stemmer);
+        }
+
+        $tokens = $compiler->newTokens($input);
+
+        if ($stemmer) {
+          $actual = array(
+            $compiler->compileLiteralQuery($tokens),
+            $compiler->compileStemmedQuery($tokens),
+          );
+        } else {
+          $actual = $compiler->compileQuery($tokens);
+        }
+      } catch (PhutilSearchQueryCompilerSyntaxException $ex) {
+        // Any syntax error collapses the whole result to false.
+        $actual = false;
+      }
+
+      if ($stemmer) {
+        $message = pht('Stemmed compilation of query: %s', $input);
+      } else {
+        $message = pht('Compilation of query: %s', $input);
+      }
+
+      $this->assertEqual($expect, $actual, $message);
+    }
+  }
+
+  /**
+   * Tokenize each query with function syntax enabled and compare the
+   * (function, operator, value) triple of every token to the expected
+   * list.
+   *
+   * @param map<string, list> $tests Map from raw query to expected triples.
+   * @return void
+   */
+  private function assertCompileFunctionQueries(array $tests) {
+    foreach ($tests as $input => $expect) {
+      $compiler = new PhutilSearchQueryCompiler();
+      $compiler->setEnableFunctions(true);
+
+      $actual = array();
+      foreach ($compiler->newTokens($input) as $token) {
+        $actual[] = array(
+          $token->getFunction(),
+          $token->getOperator(),
+          $token->getValue(),
+        );
+      }
+
+      $this->assertEqual(
+        $expect,
+        $actual,
+        pht('Function compilation of query: %s', $input));
+    }
+  }
+
+}
diff --git a/src/applications/search/compiler/__tests__/PhutilSearchStemmerTestCase.php b/src/applications/search/compiler/__tests__/PhutilSearchStemmerTestCase.php
new file mode 100644
--- /dev/null
+++ b/src/applications/search/compiler/__tests__/PhutilSearchStemmerTestCase.php
@@ -0,0 +1,85 @@
+<?php
+
+final class PhutilSearchStemmerTestCase
+ extends PhutilTestCase {
+
+ /**
+ * Check the stem produced by stemToken() for individual tokens. The
+ * table maps each input token to its expected stem.
+ */
+ public function testStemTokens() {
+ $tests = array(
+ // Various real-world cases collected from users before we implemented
+ // stemming.
+ 'tokens' => 'token',
+ 'panels' => 'panel',
+
+ 'renames' => 'renam',
+ 'rename' => 'renam',
+
+ 'components' => 'compon',
+ 'component' => 'compon',
+
+ 'implementation' => 'implement',
+ 'implements' => 'implement',
+ 'implementing' => 'implement',
+ 'implementer' => 'implement',
+
+ 'deleting' => 'delet',
+ 'deletion' => 'delet',
+ 'delete' => 'delet',
+
+ 'erratically' => 'errat',
+ 'erratic' => 'errat',
+
+ // Stems should be normalized.
+ 'DOG' => 'dog',
+
+ // If stemming would bring a token under 3 characters, it should not
+ // be stemmed.
+ 'dns' => 'dns',
+ 'nis' => 'nis',
+
+ // Complex tokens with internal punctuation should be left untouched;
+ // these are usually things like domain names, API calls, informal tags,
+ // etc.
+ 'apples' => 'appl',
+ 'bananas' => 'banana',
+ 'apples_bananas' => 'apples_bananas',
+ 'apples_bananas.apples_bananas' => 'apples_bananas.apples_bananas',
+ );
+
+ $stemmer = new PhutilSearchStemmer();
+ foreach ($tests as $input => $expect) {
+ $stem = $stemmer->stemToken($input);
+ $this->assertEqual(
+ $expect,
+ $stem,
+ pht('Token stem of "%s".', $input));
+ }
+ }
+
+ /**
+ * Check the output of stemCorpus() for whole documents. The table maps
+ * each input text to the expected space-separated stems; words shorter
+ * than three bytes and repeated words do not appear in the output.
+ */
+ public function testStemDocuments() {
+ $tests = array(
+ 'The wild boar meandered erratically.' =>
+ 'the wild boar meander errat',
+ 'Fool me onc, shame on you. Fool me twice, shame on me.' =>
+ 'fool onc shame you twice',
+ 'Fireball is a seventh-level spell which deals 2d16 points of damage '.
+ 'in a 1-meter radius around a target.' =>
+ 'firebal seventh level spell which deal 2d16 point damag meter '.
+ 'radiu around target',
+ // A hyphen separates words; "_" and "." keep a token together, and
+ // such tokens are not stemmed.
+ 'apples-bananas' => 'appl banana',
+ 'apples_bananas' => 'apples_bananas',
+ 'apples.bananas' => 'apples.bananas',
+ 'oddly-proportioned' => 'oddli proport',
+ );
+
+ $stemmer = new PhutilSearchStemmer();
+ foreach ($tests as $input => $expect) {
+ $stem = $stemmer->stemCorpus($input);
+ $this->assertEqual(
+ $expect,
+ $stem,
+ pht('Corpus stem of: %s', $input));
+ }
+ }
+
+
+}

File Metadata

Mime Type
text/plain
Expires
Fri, Oct 10, 3:00 AM (2 w, 1 d ago)
Storage Engine
amazon-s3
Storage Format
Encrypted (AES-256-CBC)
Storage Handle
phabricator/secure/so/xn/zg3x2wwbnb3ybxjf
Default Alt Text
D20939.diff (38 KB)

Event Timeline