diff --git a/src/__phutil_library_map__.php b/src/__phutil_library_map__.php
--- a/src/__phutil_library_map__.php
+++ b/src/__phutil_library_map__.php
@@ -561,6 +561,7 @@
     'phutil_utf8_encode_codepoint' => 'utils/utf8.php',
     'phutil_utf8_hard_wrap' => 'utils/utf8.php',
     'phutil_utf8_hard_wrap_html' => 'utils/utf8.php',
+    'phutil_utf8_is_cjk' => 'utils/utf8.php',
     'phutil_utf8_is_combining_character' => 'utils/utf8.php',
     'phutil_utf8_strlen' => 'utils/utf8.php',
     'phutil_utf8_strtolower' => 'utils/utf8.php',
diff --git a/src/search/PhutilSearchQueryCompiler.php b/src/search/PhutilSearchQueryCompiler.php
--- a/src/search/PhutilSearchQueryCompiler.php
+++ b/src/search/PhutilSearchQueryCompiler.php
@@ -271,6 +271,8 @@
         continue;
       }
 
+      $is_quoted = $token['quoted'];
+
       switch ($operator_string) {
         case '-':
           $operator = self::OPERATOR_NOT;
@@ -281,10 +283,20 @@
         case '=':
           $operator = self::OPERATOR_EXACT;
           break;
-        case '':
         case '+':
           $operator = self::OPERATOR_AND;
           break;
+        case '':
+          // See T12995. If this query term contains Chinese, Japanese or
+          // Korean characters, treat the term as a substring term by default.
+          // These languages do not separate words with spaces, so the term
+          // search mode is normally useless.
+          if ($enable_functions && !$is_quoted && phutil_utf8_is_cjk($value)) {
+            $operator = self::OPERATOR_SUBSTRING;
+          } else {
+            $operator = self::OPERATOR_AND;
+          }
+          break;
         default:
           throw new PhutilSearchQueryCompilerSyntaxException(
             pht(
@@ -294,7 +306,7 @@
 
       $result = array(
         'operator' => $operator,
-        'quoted' => $token['quoted'],
+        'quoted' => $is_quoted,
         'value' => $value,
       );
 
diff --git a/src/search/__tests__/PhutilSearchQueryCompilerTestCase.php b/src/search/__tests__/PhutilSearchQueryCompilerTestCase.php
--- a/src/search/__tests__/PhutilSearchQueryCompilerTestCase.php
+++ b/src/search/__tests__/PhutilSearchQueryCompilerTestCase.php
@@ -92,6 +92,8 @@
     $op_sub = PhutilSearchQueryCompiler::OPERATOR_SUBSTRING;
     $op_exact = PhutilSearchQueryCompiler::OPERATOR_EXACT;
 
+    $mao = "\xE7\x8C\xAB";
+
     $function_tests = array(
       'cat' => array(
         array(null, $op_and, 'cat'),
@@ -119,6 +121,18 @@
       '~"core and seven years ag"' => array(
         array(null, $op_sub, 'core and seven years ag'),
       ),
+      $mao => array(
+        array(null, $op_sub, $mao),
+      ),
+      '+'.$mao => array(
+        array(null, $op_and, $mao),
+      ),
+      '~'.$mao => array(
+        array(null, $op_sub, $mao),
+      ),
+      '"'.$mao.'"' => array(
+        array(null, $op_and, $mao),
+      ),
     );
 
     $this->assertCompileFunctionQueries($function_tests);
diff --git a/src/utils/__tests__/PhutilUTF8TestCase.php b/src/utils/__tests__/PhutilUTF8TestCase.php
--- a/src/utils/__tests__/PhutilUTF8TestCase.php
+++ b/src/utils/__tests__/PhutilUTF8TestCase.php
@@ -609,6 +609,24 @@
     $this->assertTrue(true);
   }
 
+  public function testCJK() {
+    $map = array(
+      '' => false,
+      'a' => false,
+      '.' => false,
+      "\xE2\x98\x83" => false,
+      "\xE5\xA0\xB1" => true,
+      "\xE3\x81\x82" => true,
+      "\xE3\x82\xA2" => true,
+    );
+
+    foreach ($map as $input => $expect) {
+      $actual = phutil_utf8_is_cjk($input);
+
+      $this->assertEqual($expect, $actual, pht('CJK: "%s"', $input));
+    }
+  }
+
   public function testUTF8BMP() {
     $tests = array(
       '' => array(
diff --git a/src/utils/utf8.php b/src/utils/utf8.php
--- a/src/utils/utf8.php
+++ b/src/utils/utf8.php
@@ -327,6 +327,77 @@
 }
 
 
+/**
+ * Test if a string contains Chinese, Japanese, or Korean characters.
+ *
+ * Most languages use spaces to separate words, but these languages do not.
+ *
+ * This matches Han ideographs (shared by Chinese, Japanese, and Korean) and
+ * the Japanese Hiragana and Katakana syllabaries. Hangul is not matched.
+ *
+ * @param string String to examine, in UTF8.
+ * @return bool True if the string contains at least one Han, Hiragana, or
+ *   Katakana character.
+ */
+function phutil_utf8_is_cjk($string) {
+  $codepoints = phutil_utf8v_codepoints($string);
+
+  foreach ($codepoints as $codepoint) {
+    // CJK Unified Ideographs
+    if ($codepoint >= 0x4E00 && $codepoint <= 0x9FFF) {
+      return true;
+    }
+
+    // CJK Unified Ideographs Extension A
+    if ($codepoint >= 0x3400 && $codepoint <= 0x4DBF) {
+      return true;
+    }
+
+    // CJK Unified Ideographs Extension B
+    if ($codepoint >= 0x20000 && $codepoint <= 0x2A6DF) {
+      return true;
+    }
+
+    // CJK Unified Ideographs Extension C
+    if ($codepoint >= 0x2A700 && $codepoint <= 0x2B73F) {
+      return true;
+    }
+
+    // CJK Unified Ideographs Extension D
+    if ($codepoint >= 0x2B740 && $codepoint <= 0x2B81F) {
+      return true;
+    }
+
+    // CJK Unified Ideographs Extension E
+    if ($codepoint >= 0x2B820 && $codepoint <= 0x2CEAF) {
+      return true;
+    }
+
+    // CJK Unified Ideographs Extension F
+    if ($codepoint >= 0x2CEB0 && $codepoint <= 0x2EBEF) {
+      return true;
+    }
+
+    // CJK Compatibility Ideographs
+    if ($codepoint >= 0xF900 && $codepoint <= 0xFAFF) {
+      return true;
+    }
+
+    // Hiragana
+    if ($codepoint >= 0x3040 && $codepoint <= 0x309F) {
+      return true;
+    }
+
+    // Katakana
+    if ($codepoint >= 0x30A0 && $codepoint <= 0x30FF) {
+      return true;
+    }
+  }
+
+  return false;
+}
+
+
 /**
  * Split a UTF-8 string into an array of characters. Combining characters are
  * also split.