diff --git a/src/__phutil_library_map__.php b/src/__phutil_library_map__.php
--- a/src/__phutil_library_map__.php
+++ b/src/__phutil_library_map__.php
@@ -380,6 +380,9 @@
     'PhutilSafeHTMLProducerInterface' => 'markup/PhutilSafeHTMLProducerInterface.php',
     'PhutilSafeHTMLTestCase' => 'markup/__tests__/PhutilSafeHTMLTestCase.php',
     'PhutilSaturateStdoutDaemon' => 'daemon/torture/PhutilSaturateStdoutDaemon.php',
+    'PhutilSearchQueryCompiler' => 'search/PhutilSearchQueryCompiler.php',
+    'PhutilSearchQueryCompilerSyntaxException' => 'search/PhutilSearchQueryCompilerSyntaxException.php',
+    'PhutilSearchQueryCompilerTestCase' => 'search/__tests__/PhutilSearchQueryCompilerTestCase.php',
     'PhutilServiceProfiler' => 'serviceprofiler/PhutilServiceProfiler.php',
     'PhutilShellLexer' => 'lexer/PhutilShellLexer.php',
     'PhutilShellLexerTestCase' => 'lexer/__tests__/PhutilShellLexerTestCase.php',
@@ -981,6 +984,9 @@
     'PhutilSafeHTML' => 'Phobject',
     'PhutilSafeHTMLTestCase' => 'PhutilTestCase',
     'PhutilSaturateStdoutDaemon' => 'PhutilTortureTestDaemon',
+    'PhutilSearchQueryCompiler' => 'Phobject',
+    'PhutilSearchQueryCompilerSyntaxException' => 'Exception',
+    'PhutilSearchQueryCompilerTestCase' => 'PhutilTestCase',
     'PhutilServiceProfiler' => 'Phobject',
     'PhutilShellLexer' => 'PhutilLexer',
     'PhutilShellLexerTestCase' => 'PhutilTestCase',
diff --git a/src/search/PhutilSearchQueryCompiler.php b/src/search/PhutilSearchQueryCompiler.php
new file mode 100644
--- /dev/null
+++ b/src/search/PhutilSearchQueryCompiler.php
@@ -0,0 +1,296 @@
+<?php
+
+/**
+ * Compile a user-entered search query into a normalized query string.
+ *
+ * The query is split into terms delimited by whitespace or double quotes.
+ * Each term may be preceded by a single "+" or "-" operator. Compiled terms
+ * are rendered with an operator prefix and quote characters read from the
+ * configured operator string.
+ */
+final class PhutilSearchQueryCompiler
+  extends Phobject {
+
+  // Only offsets 0 (AND prefix), 2 (NOT prefix), 10 (open quote) and
+  // 11 (close quote) of this string are read by this class.
+  private $operators = '+ -><()~*:""&|';
+  private $query;
+
+  const OPERATOR_NOT = 'not';
+  const OPERATOR_AND = 'and';
+
+  /**
+   * Replace the operator string used when rendering terms.
+   *
+   * @param string Operator characters, laid out like the default value.
+   * @return this
+   */
+  public function setOperators($operators) {
+    $this->operators = $operators;
+    return $this;
+  }
+
+  public function getOperators() {
+    return $this->operators;
+  }
+
+  /**
+   * Set the raw query to compile.
+   *
+   * @param string Raw query text.
+   * @return this
+   */
+  public function setQuery($query) {
+    $this->query = $query;
+    return $this;
+  }
+
+  public function getQuery() {
+    return $this->query;
+  }
+
+  /**
+   * Compile the current query.
+   *
+   * Duplicate rendered terms are dropped, and the remaining terms are
+   * joined with single spaces.
+   *
+   * @return string Compiled query.
+   * @throws PhutilSearchQueryCompilerSyntaxException If the query is
+   *   malformed or too long.
+   */
+  public function compileQuery() {
+    $query = $this->getQuery();
+    $tokens = $this->tokenizeQuery($query);
+
+    $result = array();
+    foreach ($tokens as $token) {
+      $result[] = $this->renderToken($token);
+    }
+
+    $result = array_unique($result);
+    return implode(' ', $result);
+  }
+
+  /**
+   * Split a raw query into a list of operator/value tokens.
+   *
+   * @param string Raw query text.
+   * @return list Tokens, each with "operator", "quoted" and "value" keys.
+   * @throws PhutilSearchQueryCompilerSyntaxException If the query is too
+   *   long, has unmatched quotes, or has stray or repeated operators.
+   */
+  private function tokenizeQuery($query) {
+    $maximum_bytes = 1024;
+
+    $query_bytes = strlen($query);
+    if ($query_bytes > $maximum_bytes) {
+      throw new PhutilSearchQueryCompilerSyntaxException(
+        pht(
+          'Query is too long (%s bytes, maximum is %s bytes).',
+          new PhutilNumber($query_bytes),
+          new PhutilNumber($maximum_bytes)));
+    }
+
+    $query = phutil_utf8v($query);
+    $length = count($query);
+
+    // Walk the query with a small state machine. The modes fall through in
+    // order: "scan" skips whitespace, "operator" collects "+"/"-", "quote"
+    // consumes an opening quote, and "token" collects the term itself.
+    $mode = 'scan';
+    $current_operator = array();
+    $current_token = array();
+    $is_quoted = false;
+    $tokens = array();
+    for ($ii = 0; $ii < $length; $ii++) {
+      $character = $query[$ii];
+
+      if ($mode == 'scan') {
+        if (preg_match('/^\s\z/u', $character)) {
+          continue;
+        }
+
+        $mode = 'operator';
+      }
+
+      if ($mode == 'operator') {
+        if (preg_match('/^\s\z/u', $character)) {
+          continue;
+        }
+
+        if (preg_match('/^[+-]\z/', $character)) {
+          $current_operator[] = $character;
+          continue;
+        }
+
+        $mode = 'quote';
+      }
+
+      if ($mode == 'quote') {
+        if (preg_match('/^"\z/', $character)) {
+          $is_quoted = true;
+          $mode = 'token';
+          continue;
+        }
+
+        $mode = 'token';
+      }
+
+      if ($mode == 'token') {
+        $capture = false;
+        $was_quoted = $is_quoted;
+        if ($is_quoted) {
+          if (preg_match('/^"\z/', $character)) {
+            $capture = true;
+            $mode = 'scan';
+            $is_quoted = false;
+          }
+        } else {
+          if (preg_match('/^\s\z/u', $character)) {
+            $capture = true;
+            $mode = 'scan';
+          }
+
+          // A quote ends this term and opens a new quoted term.
+          if (preg_match('/^"\z/', $character)) {
+            $capture = true;
+            $mode = 'token';
+            $is_quoted = true;
+          }
+        }
+
+        if ($capture) {
+          $tokens[] = array(
+            'operator' => $current_operator,
+            'quoted' => $was_quoted,
+            'value' => $current_token,
+          );
+          $current_operator = array();
+          $current_token = array();
+          continue;
+        } else {
+          $current_token[] = $character;
+        }
+      }
+    }
+
+    if ($is_quoted) {
+      throw new PhutilSearchQueryCompilerSyntaxException(
+        pht(
+          'Query contains unmatched double quotes.'));
+    }
+
+    if ($mode == 'operator') {
+      throw new PhutilSearchQueryCompilerSyntaxException(
+        pht(
+          'Query contains operator ("%s") with no search term.',
+          implode('', $current_operator)));
+    }
+
+    // Flush whatever term was still being collected at end of input.
+    $tokens[] = array(
+      'operator' => $current_operator,
+      'quoted' => false,
+      'value' => $current_token,
+    );
+
+    $results = array();
+    foreach ($tokens as $token) {
+      $value = implode('', $token['value']);
+      $operator_string = implode('', $token['operator']);
+
+      // Skip empty terms, such as the one produced by an empty query.
+      if (!strlen($value)) {
+        continue;
+      }
+
+      switch ($operator_string) {
+        case '-':
+          $operator = self::OPERATOR_NOT;
+          break;
+        case '':
+        case '+':
+          $operator = self::OPERATOR_AND;
+          break;
+        default:
+          throw new PhutilSearchQueryCompilerSyntaxException(
+            pht(
+              'Query has an invalid sequence of operators ("%s").',
+              $operator_string));
+      }
+
+      $results[] = array(
+        'operator' => $operator,
+        'quoted' => $token['quoted'],
+        'value' => $value,
+      );
+    }
+
+    return $results;
+  }
+
+  /**
+   * Render one token as its operator prefix followed by the quoted value.
+   *
+   * @param dict Token produced by tokenizeQuery().
+   * @return string Rendered term.
+   */
+  private function renderToken(array $token) {
+    $value = $this->quoteToken($token['value']);
+    $operator = $token['operator'];
+    $prefix = $this->getOperatorPrefix($operator);
+
+    $value = $prefix.$value;
+
+    return $value;
+  }
+
+  /**
+   * Look up the prefix character for an operator constant.
+   *
+   * @param const One of the OPERATOR_* constants.
+   * @return string|null Prefix, or null if the configured prefix is a space.
+   * @throws PhutilSearchQueryCompilerSyntaxException If the operator is not
+   *   recognized.
+   */
+  private function getOperatorPrefix($operator) {
+    $operators = $this->operators;
+
+    switch ($operator) {
+      case self::OPERATOR_AND:
+        $prefix = $operators[0];
+        break;
+      case self::OPERATOR_NOT:
+        $prefix = $operators[2];
+        break;
+      default:
+        throw new PhutilSearchQueryCompilerSyntaxException(
+          pht(
+            'Unsupported operator prefix "%s".',
+            $operator));
+    }
+
+    if ($prefix == ' ') {
+      $prefix = null;
+    }
+
+    return $prefix;
+  }
+
+  /**
+   * Wrap a value in the configured open and close quote characters.
+   *
+   * @param string Term value.
+   * @return string Quoted value.
+   */
+  private function quoteToken($value) {
+    $operators = $this->operators;
+
+    $open_quote = $operators[10];
+    $close_quote = $operators[11];
+
+    return $open_quote.$value.$close_quote;
+  }
+
+}
diff --git a/src/search/PhutilSearchQueryCompilerSyntaxException.php b/src/search/PhutilSearchQueryCompilerSyntaxException.php
new file mode 100644
--- /dev/null
+++ b/src/search/PhutilSearchQueryCompilerSyntaxException.php
@@ -0,0 +1,4 @@
+<?php
+
+final class PhutilSearchQueryCompilerSyntaxException
+  extends Exception {}
diff --git a/src/search/__tests__/PhutilSearchQueryCompilerTestCase.php b/src/search/__tests__/PhutilSearchQueryCompilerTestCase.php
new file mode 100644
--- /dev/null
+++ b/src/search/__tests__/PhutilSearchQueryCompilerTestCase.php
@@ -0,0 +1,96 @@
+<?php
+
+final class PhutilSearchQueryCompilerTestCase
+  extends PhutilTestCase {
+
+  public function testCompileQueries() {
+    $tests = array(
+      '' => '',
+      'cat dog' => '+"cat" +"dog"',
+      'cat -dog' => '+"cat" -"dog"',
+      'cat-dog' => '+"cat-dog"',
+
+      // If there are spaces after an operator, the operator applies to the
+      // next search term.
+      'cat - dog' => '+"cat" -"dog"',
+
+      // Double quotes serve as delimiters even if there is no whitespace
+      // between terms.
+      '"cat"dog' => '+"cat" +"dog"',
+
+      // This query is too long.
+      str_repeat('x', 2048) => false,
+
+      // Multiple operators are not permitted.
+      '++cat' => false,
+      '+-cat' => false,
+      '--cat' => false,
+
+      // Stray operators are not permitted.
+      '+' => false,
+      'cat +' => false,
+
+      // Double quotes must be paired.
+      '"' => false,
+      'cat "' => false,
+      '"cat' => false,
+      'A"' => false,
+      'A"B"' => '+"A" +"B"',
+    );
+
+    $this->assertCompileQueries($tests);
+
+    // Test that we compile queries correctly if the operators have been
+    // swapped to use "AND" by default.
+    $operator_tests = array(
+      'cat dog' => '"cat" "dog"',
+      'cat -dog' => '"cat" -"dog"',
+    );
+    $this->assertCompileQueries($operator_tests, ' |-><()~*:""&\'');
+
+    // Test that we compile queries correctly if the quote operators have
+    // been swapped to differ.
+    $quote_tests = array(
+      'cat dog' => '+[cat] +[dog]',
+      'cat -dog' => '+[cat] -[dog]',
+    );
+    $this->assertCompileQueries($quote_tests, '+ -><()~*:[]&|');
+  }
+
+  /**
+   * Compile each query and compare the result against the expectation.
+   *
+   * @param map Map from raw query to the expected compiled query, or to
+   *   false if compilation is expected to raise a syntax exception.
+   * @param string|null Optional operator string to compile with.
+   * @return void
+   */
+  private function assertCompileQueries(array $tests, $operators = null) {
+    foreach ($tests as $input => $expect) {
+      $caught = null;
+
+      try {
+        $compiler = id(new PhutilSearchQueryCompiler())
+          ->setQuery($input);
+
+        if ($operators !== null) {
+          $compiler->setOperators($operators);
+        }
+
+        $query = $compiler->compileQuery();
+      } catch (PhutilSearchQueryCompilerSyntaxException $ex) {
+        $caught = $ex;
+      }
+
+      if ($caught !== null) {
+        $query = false;
+      }
+
+      $this->assertEqual(
+        $expect,
+        $query,
+        pht('Compilation of query: %s', $input));
+    }
+  }
+
+}