diff --git a/src/__phutil_library_map__.php b/src/__phutil_library_map__.php --- a/src/__phutil_library_map__.php +++ b/src/__phutil_library_map__.php @@ -5658,6 +5658,12 @@ 'PhutilRemarkupTableBlockRule' => 'infrastructure/markup/blockrule/PhutilRemarkupTableBlockRule.php', 'PhutilRemarkupTestInterpreterRule' => 'infrastructure/markup/blockrule/PhutilRemarkupTestInterpreterRule.php', 'PhutilRemarkupUnderlineRule' => 'infrastructure/markup/markuprule/PhutilRemarkupUnderlineRule.php', + 'PhutilSearchQueryCompiler' => 'applications/search/compiler/PhutilSearchQueryCompiler.php', + 'PhutilSearchQueryCompilerSyntaxException' => 'applications/search/compiler/PhutilSearchQueryCompilerSyntaxException.php', + 'PhutilSearchQueryCompilerTestCase' => 'applications/search/compiler/__tests__/PhutilSearchQueryCompilerTestCase.php', + 'PhutilSearchQueryToken' => 'applications/search/compiler/PhutilSearchQueryToken.php', + 'PhutilSearchStemmer' => 'applications/search/compiler/PhutilSearchStemmer.php', + 'PhutilSearchStemmerTestCase' => 'applications/search/compiler/__tests__/PhutilSearchStemmerTestCase.php', 'PhutilSlackAuthAdapter' => 'applications/auth/adapter/PhutilSlackAuthAdapter.php', 'PhutilTwitchAuthAdapter' => 'applications/auth/adapter/PhutilTwitchAuthAdapter.php', 'PhutilTwitterAuthAdapter' => 'applications/auth/adapter/PhutilTwitterAuthAdapter.php', @@ -12483,6 +12489,12 @@ 'PhutilRemarkupTableBlockRule' => 'PhutilRemarkupBlockRule', 'PhutilRemarkupTestInterpreterRule' => 'PhutilRemarkupBlockInterpreter', 'PhutilRemarkupUnderlineRule' => 'PhutilRemarkupRule', + 'PhutilSearchQueryCompiler' => 'Phobject', + 'PhutilSearchQueryCompilerSyntaxException' => 'Exception', + 'PhutilSearchQueryCompilerTestCase' => 'PhutilTestCase', + 'PhutilSearchQueryToken' => 'Phobject', + 'PhutilSearchStemmer' => 'Phobject', + 'PhutilSearchStemmerTestCase' => 'PhutilTestCase', 'PhutilSlackAuthAdapter' => 'PhutilOAuthAuthAdapter', 'PhutilTwitchAuthAdapter' => 'PhutilOAuthAuthAdapter', 'PhutilTwitterAuthAdapter' => 'PhutilOAuth1AuthAdapter', diff --git a/src/applications/search/compiler/PhutilSearchQueryCompiler.php b/src/applications/search/compiler/PhutilSearchQueryCompiler.php new file mode 100644 --- /dev/null +++ b/src/applications/search/compiler/PhutilSearchQueryCompiler.php @@ -0,0 +1,374 @@ +<()~*:""&|'; + private $query; + private $stemmer; + private $enableFunctions = false; + + const OPERATOR_NOT = 'not'; + const OPERATOR_AND = 'and'; + const OPERATOR_SUBSTRING = 'sub'; + const OPERATOR_EXACT = 'exact'; + + public function setOperators($operators) { + $this->operators = $operators; + return $this; + } + + public function getOperators() { + return $this->operators; + } + + public function setStemmer(PhutilSearchStemmer $stemmer) { + $this->stemmer = $stemmer; + return $this; + } + + public function getStemmer() { + return $this->stemmer; + } + + public function setEnableFunctions($enable_functions) { + $this->enableFunctions = $enable_functions; + return $this; + } + + public function getEnableFunctions() { + return $this->enableFunctions; + } + + public function compileQuery(array $tokens) { + assert_instances_of($tokens, 'PhutilSearchQueryToken'); + + $result = array(); + foreach ($tokens as $token) { + $result[] = $this->renderToken($token); + } + + return $this->compileRenderedTokens($result); + } + + public function compileLiteralQuery(array $tokens) { + assert_instances_of($tokens, 'PhutilSearchQueryToken'); + + $result = array(); + foreach ($tokens as $token) { + if (!$token->isQuoted()) { + continue; + } + $result[] = $this->renderToken($token); + } + + return $this->compileRenderedTokens($result); + } + + public function compileStemmedQuery(array $tokens) { + assert_instances_of($tokens, 'PhutilSearchQueryToken'); + + $result = array(); + foreach ($tokens as $token) { + if ($token->isQuoted()) { + continue; + } + $result[] = $this->renderToken($token, $this->getStemmer()); + } + + return $this->compileRenderedTokens($result); + } + + private function compileRenderedTokens(array $list) { + if (!$list) { + return null; + } + + $list = array_unique($list); + return implode(' ', $list); + } + + public function newTokens($query) { + $results = $this->tokenizeQuery($query); + + $tokens = array(); + foreach ($results as $result) { + $tokens[] = PhutilSearchQueryToken::newFromDictionary($result); + } + + return $tokens; + } + + private function tokenizeQuery($query) { + $maximum_bytes = 1024; + + $query_bytes = strlen($query); + if ($query_bytes > $maximum_bytes) { + throw new PhutilSearchQueryCompilerSyntaxException( + pht( + 'Query is too long (%s bytes, maximum is %s bytes).', + new PhutilNumber($query_bytes), + new PhutilNumber($maximum_bytes))); + } + + $query = phutil_utf8v($query); + $length = count($query); + + $enable_functions = $this->getEnableFunctions(); + + $mode = 'scan'; + $current_operator = array(); + $current_token = array(); + $current_function = null; + $is_quoted = false; + $tokens = array(); + + if ($enable_functions) { + $operator_characters = '[~=+-]'; + } else { + $operator_characters = '[+-]'; + } + + for ($ii = 0; $ii < $length; $ii++) { + $character = $query[$ii]; + + if ($mode == 'scan') { + if (preg_match('/^\s\z/u', $character)) { + continue; + } + + $mode = 'function'; + } + + if ($mode == 'function') { + $mode = 'operator'; + + if ($enable_functions) { + $found = false; + for ($jj = $ii; $jj < $length; $jj++) { + if (preg_match('/^[a-zA-Z]\z/u', $query[$jj])) { + continue; + } + if ($query[$jj] == ':') { + $found = $jj; + } + break; + } + + if ($found !== false) { + $function = array_slice($query, $ii, ($jj - $ii)); + $current_function = implode('', $function); + + if (!strlen($current_function)) { + $current_function = null; + } + + $ii = $jj; + continue; + } + } + } + + if ($mode == 'operator') { + if (preg_match('/^\s\z/u', $character)) { + continue; + } + + if (preg_match('/^'.$operator_characters.'\z/', $character)) { + $current_operator[] = $character; + continue; + } + + $mode = 'quote'; + } + + if ($mode == 'quote') { + if (preg_match('/^"\z/', $character)) { + $is_quoted = true; + $mode = 'token'; + continue; + } + + $mode = 'token'; + } + + if ($mode == 'token') { + $capture = false; + $was_quoted = $is_quoted; + if ($is_quoted) { + if (preg_match('/^"\z/', $character)) { + $capture = true; + $mode = 'scan'; + $is_quoted = false; + } + } else { + if (preg_match('/^\s\z/u', $character)) { + $capture = true; + $mode = 'scan'; + } + + if (preg_match('/^"\z/', $character)) { + $capture = true; + $mode = 'token'; + $is_quoted = true; + } + } + + if ($capture) { + $token = array( + 'operator' => $current_operator, + 'quoted' => $was_quoted, + 'value' => $current_token, + ); + + if ($enable_functions) { + $token['function'] = $current_function; + } + + $tokens[] = $token; + + $current_operator = array(); + $current_token = array(); + $current_function = null; + continue; + } else { + $current_token[] = $character; + } + } + } + + if ($is_quoted) { + throw new PhutilSearchQueryCompilerSyntaxException( + pht( + 'Query contains unmatched double quotes.')); + } + + if ($mode == 'operator') { + throw new PhutilSearchQueryCompilerSyntaxException( + pht( + 'Query contains operator ("%s") with no search term.', + implode('', $current_operator))); + } + + $token = array( + 'operator' => $current_operator, + 'quoted' => false, + 'value' => $current_token, + ); + + if ($enable_functions) { + $token['function'] = $current_function; + } + + $tokens[] = $token; + + $results = array(); + foreach ($tokens as $token) { + $value = implode('', $token['value']); + $operator_string = implode('', $token['operator']); + + if (!strlen($value)) { + continue; + } + + $is_quoted = $token['quoted']; + + switch ($operator_string) { + case '-': + $operator = self::OPERATOR_NOT; + break; + case '~': + $operator = self::OPERATOR_SUBSTRING; + break; + case '=': + $operator = self::OPERATOR_EXACT; + break; + case '+': + $operator = self::OPERATOR_AND; + break; + case '': + // See T12995. If this query term contains Chinese, Japanese or + // Korean characters, treat the term as a substring term by default. + // These languages do not separate words with spaces, so the term + // search mode is normally useless. + if ($enable_functions && !$is_quoted && phutil_utf8_is_cjk($value)) { + $operator = self::OPERATOR_SUBSTRING; + } else { + $operator = self::OPERATOR_AND; + } + break; + default: + throw new PhutilSearchQueryCompilerSyntaxException( + pht( + 'Query has an invalid sequence of operators ("%s").', + $operator_string)); + } + + $result = array( + 'operator' => $operator, + 'quoted' => $is_quoted, + 'value' => $value, + ); + + if ($enable_functions) { + $result['function'] = $token['function']; + } + + $results[] = $result; + } + + return $results; + } + + private function renderToken( + PhutilSearchQueryToken $token, + PhutilSearchStemmer $stemmer = null) { + $value = $token->getValue(); + + if ($stemmer) { + $value = $stemmer->stemToken($value); + } + + $value = $this->quoteToken($value); + $operator = $token->getOperator(); + $prefix = $this->getOperatorPrefix($operator); + + $value = $prefix.$value; + + return $value; + } + + private function getOperatorPrefix($operator) { + $operators = $this->operators; + + switch ($operator) { + case self::OPERATOR_AND: + $prefix = $operators[0]; + break; + case self::OPERATOR_NOT: + $prefix = $operators[2]; + break; + default: + throw new PhutilSearchQueryCompilerSyntaxException( + pht( + 'Unsupported operator prefix "%s".', + $operator)); + } + + if ($prefix == ' ') { + $prefix = null; + } + + return $prefix; + } + + private function quoteToken($value) { + $operators = $this->operators; + + $open_quote = $this->operators[10]; + $close_quote = $this->operators[11]; + + return $open_quote.$value.$close_quote; + } + +} diff --git a/src/applications/search/compiler/PhutilSearchQueryCompilerSyntaxException.php b/src/applications/search/compiler/PhutilSearchQueryCompilerSyntaxException.php new file mode 100644 --- /dev/null +++ b/src/applications/search/compiler/PhutilSearchQueryCompilerSyntaxException.php @@ -0,0 +1,4 @@ +isQuoted = $dictionary['quoted']; + $token->operator = $dictionary['operator']; + $token->value = $dictionary['value']; + $token->function = idx($dictionary, 'function'); + + return $token; + } + + public function isQuoted() { + return $this->isQuoted; + } + + public function getValue() { + return $this->value; + } + + public function getOperator() { + return $this->operator; + } + + public function getFunction() { + return $this->function; + } + +} diff --git a/src/applications/search/compiler/PhutilSearchStemmer.php b/src/applications/search/compiler/PhutilSearchStemmer.php new file mode 100644 --- /dev/null +++ b/src/applications/search/compiler/PhutilSearchStemmer.php @@ -0,0 +1,74 @@ +normalizeToken($token); + return $this->applyStemmer($token); + } + + public function stemCorpus($corpus) { + $corpus = $this->normalizeCorpus($corpus); + $tokens = preg_split('/[^a-zA-Z0-9\x7F-\xFF._]+/', $corpus); + + $words = array(); + foreach ($tokens as $key => $token) { + $token = trim($token, '._'); + + if (strlen($token) < 3) { + continue; + } + + $words[$token] = $token; + } + + $stems = array(); + foreach ($words as $word) { + $stems[] = $this->applyStemmer($word); + } + + return implode(' ', $stems); + } + + private function normalizeToken($token) { + return phutil_utf8_strtolower($token); + } + + private function normalizeCorpus($corpus) { + return phutil_utf8_strtolower($corpus); + } + + /** + * @phutil-external-symbol class Porter + */ + private function applyStemmer($normalized_token) { + // If the token has internal punctuation, handle it literally. This + // deals with things like domain names, Conduit API methods, and other + // sorts of informal tokens. + if (preg_match('/[._]/', $normalized_token)) { + return $normalized_token; + } + + static $loaded; + + if ($loaded === null) { + $root = dirname(phutil_get_library_root('phutil')); + require_once $root.'/externals/porter-stemmer/src/Porter.php'; + $loaded = true; + } + + + $stem = Porter::stem($normalized_token); + + // If the stem is too short, it won't be a candidate for indexing. These + // tokens are also likely to be acronyms (like "DNS") rather than real + // English words. + if (strlen($stem) < 3) { + return $normalized_token; + } + + return $stem; + } + +} diff --git a/src/applications/search/compiler/__tests__/PhutilSearchQueryCompilerTestCase.php b/src/applications/search/compiler/__tests__/PhutilSearchQueryCompilerTestCase.php new file mode 100644 --- /dev/null +++ b/src/applications/search/compiler/__tests__/PhutilSearchQueryCompilerTestCase.php @@ -0,0 +1,220 @@ + null, + 'cat dog' => '+"cat" +"dog"', + 'cat -dog' => '+"cat" -"dog"', + 'cat-dog' => '+"cat-dog"', + + // If there are spaces after an operator, the operator applies to the + // next search term. + 'cat - dog' => '+"cat" -"dog"', + + // Double quotes serve as delimiters even if there is no whitespace + // between terms. + '"cat"dog' => '+"cat" +"dog"', + + // This query is too long. + str_repeat('x', 2048) => false, + + // Multiple operators are not permitted. + '++cat' => false, + '+-cat' => false, + '--cat' => false, + + // Stray operators are not permitted. + '+' => false, + 'cat +' => false, + + // Double quotes must be paired. + '"' => false, + 'cat "' => false, + '"cat' => false, + 'A"' => false, + 'A"B"' => '+"A" +"B"', + ); + + $this->assertCompileQueries($tests); + + // Test that we compile queries correctly if the operators have been + // swapped to use "AND" by default. + $operator_tests = array( + 'cat dog' => '"cat" "dog"', + 'cat -dog' => '"cat" -"dog"', + ); + $this->assertCompileQueries($operator_tests, ' |-><()~*:""&\''); + + + // Test that we compile queries correctly if the quote operators have + // been swapped to differ. + $quote_tests = array( + 'cat dog' => '+[cat] +[dog]', + 'cat -dog' => '+[cat] -[dog]', + ); + $this->assertCompileQueries($quote_tests, '+ -><()~*:[]&|'); + + } + + public function testCompileQueriesWithStemming() { + $stemming_tests = array( + 'cat dog' => array( + null, + '+"cat" +"dog"', + ), + 'cats dogs' => array( + null, + '+"cat" +"dog"', + ), + 'cats "dogs"' => array( + '+"dogs"', + '+"cat"', + ), + '"blessed blade" of the windseeker' => array( + '+"blessed blade"', + '+"of" +"the" +"windseek"', + ), + 'mailing users for mentions on tasks' => array( + null, + '+"mail" +"user" +"for" +"mention" +"on" +"task"', + ), + ); + + $stemmer = new PhutilSearchStemmer(); + $this->assertCompileQueries($stemming_tests, null, $stemmer); + } + + public function testCompileQueriesWithFunctions() { + $op_and = PhutilSearchQueryCompiler::OPERATOR_AND; + $op_sub = PhutilSearchQueryCompiler::OPERATOR_SUBSTRING; + $op_exact = PhutilSearchQueryCompiler::OPERATOR_EXACT; + + $mao = "\xE7\x8C\xAB"; + + $function_tests = array( + 'cat' => array( + array(null, $op_and, 'cat'), + ), + ':cat' => array( + array(null, $op_and, 'cat'), + ), + 'title:cat' => array( + array('title', $op_and, 'cat'), + ), + 'title:cat:dog' => array( + array('title', $op_and, 'cat:dog'), + ), + 'title:~cat' => array( + array('title', $op_sub, 'cat'), + ), + 'cat title:="Meow Meow"' => array( + array(null, $op_and, 'cat'), + array('title', $op_exact, 'Meow Meow'), + ), + 'title:cat title:dog' => array( + array('title', $op_and, 'cat'), + array('title', $op_and, 'dog'), + ), + '~"core and seven years ag"' => array( + array(null, $op_sub, 'core and seven years ag'), + ), + $mao => array( + array(null, $op_sub, $mao), + ), + '+'.$mao => array( + array(null, $op_and, $mao), + ), + '~'.$mao => array( + array(null, $op_sub, $mao), + ), + '"'.$mao.'"' => array( + array(null, $op_and, $mao), + ), + ); + + $this->assertCompileFunctionQueries($function_tests); + } + + private function assertCompileQueries( + array $tests, + $operators = null, + PhutilSearchStemmer $stemmer = null) { + foreach ($tests as $input => $expect) { + $caught = null; + + $query = null; + $literal_query = null; + $stemmed_query = null; + + try { + $compiler = new PhutilSearchQueryCompiler(); + + if ($operators !== null) { + $compiler->setOperators($operators); + } + + if ($stemmer !== null) { + $compiler->setStemmer($stemmer); + } + + $tokens = $compiler->newTokens($input); + + if ($stemmer) { + $literal_query = $compiler->compileLiteralQuery($tokens); + $stemmed_query = $compiler->compileStemmedQuery($tokens); + } else { + $query = $compiler->compileQuery($tokens); + } + } catch (PhutilSearchQueryCompilerSyntaxException $ex) { + $caught = $ex; + } + + if ($caught !== null) { + $query = false; + $literal_query = false; + $stemmed_query = false; + } + + if (!$stemmer) { + $this->assertEqual( + $expect, + $query, + pht('Compilation of query: %s', $input)); + } else { + $this->assertEqual( + $expect, + ($literal_query === false) + ? false + : array($literal_query, $stemmed_query), + pht('Stemmed compilation of query: %s', $input)); + } + } + } + + private function assertCompileFunctionQueries(array $tests) { + foreach ($tests as $input => $expect) { + $compiler = id(new PhutilSearchQueryCompiler()) + ->setEnableFunctions(true); + + $tokens = $compiler->newTokens($input); + + $result = array(); + foreach ($tokens as $token) { + $result[] = array( + $token->getFunction(), + $token->getOperator(), + $token->getValue(), + ); + } + + $this->assertEqual( + $expect, + $result, + pht('Function compilation of query: %s', $input)); + } + } + +} diff --git a/src/applications/search/compiler/__tests__/PhutilSearchStemmerTestCase.php b/src/applications/search/compiler/__tests__/PhutilSearchStemmerTestCase.php new file mode 100644 --- /dev/null +++ b/src/applications/search/compiler/__tests__/PhutilSearchStemmerTestCase.php @@ -0,0 +1,85 @@ + 'token', + 'panels' => 'panel', + + 'renames' => 'renam', + 'rename' => 'renam', + + 'components' => 'compon', + 'component' => 'compon', + + 'implementation' => 'implement', + 'implements' => 'implement', + 'implementing' => 'implement', + 'implementer' => 'implement', + + 'deleting' => 'delet', + 'deletion' => 'delet', + 'delete' => 'delet', + + 'erratically' => 'errat', + 'erratic' => 'errat', + + // Stems should be normalized. + 'DOG' => 'dog', + + // If stemming would bring a token under 3 characters, it should not + // be stemmed. + 'dns' => 'dns', + 'nis' => 'nis', + + // Complex tokens with internal punctuation should be left untouched; + // these are usually things like domain names, API calls, informal tags, + // etc. + 'apples' => 'appl', + 'bananas' => 'banana', + 'apples_bananas' => 'apples_bananas', + 'apples_bananas.apples_bananas' => 'apples_bananas.apples_bananas', + ); + + $stemmer = new PhutilSearchStemmer(); + foreach ($tests as $input => $expect) { + $stem = $stemmer->stemToken($input); + $this->assertEqual( + $expect, + $stem, + pht('Token stem of "%s".', $input)); + } + } + + public function testStemDocuments() { + $tests = array( + 'The wild boar meandered erratically.' => + 'the wild boar meander errat', + 'Fool me onc, shame on you. Fool me twice, shame on me.' => + 'fool onc shame you twice', + 'Fireball is a seventh-level spell which deals 2d16 points of damage '. + 'in a 1-meter radius around a target.' => + 'firebal seventh level spell which deal 2d16 point damag meter '. + 'radiu around target', + 'apples-bananas' => 'appl banana', + 'apples_bananas' => 'apples_bananas', + 'apples.bananas' => 'apples.bananas', + 'oddly-proportioned' => 'oddli proport', + ); + + $stemmer = new PhutilSearchStemmer(); + foreach ($tests as $input => $expect) { + $stem = $stemmer->stemCorpus($input); + $this->assertEqual( + $expect, + $stem, + pht('Corpus stem of: %s', $input)); + } + } + + +}