Page MenuHomePhabricator

D18634.id44741.diff
No OneTemporary

D18634.id44741.diff

diff --git a/src/__phutil_library_map__.php b/src/__phutil_library_map__.php
--- a/src/__phutil_library_map__.php
+++ b/src/__phutil_library_map__.php
@@ -561,6 +561,7 @@
'phutil_utf8_encode_codepoint' => 'utils/utf8.php',
'phutil_utf8_hard_wrap' => 'utils/utf8.php',
'phutil_utf8_hard_wrap_html' => 'utils/utf8.php',
+ 'phutil_utf8_is_cjk' => 'utils/utf8.php',
'phutil_utf8_is_combining_character' => 'utils/utf8.php',
'phutil_utf8_strlen' => 'utils/utf8.php',
'phutil_utf8_strtolower' => 'utils/utf8.php',
diff --git a/src/search/PhutilSearchQueryCompiler.php b/src/search/PhutilSearchQueryCompiler.php
--- a/src/search/PhutilSearchQueryCompiler.php
+++ b/src/search/PhutilSearchQueryCompiler.php
@@ -271,6 +271,8 @@
continue;
}
+ $is_quoted = $token['quoted'];
+
switch ($operator_string) {
case '-':
$operator = self::OPERATOR_NOT;
@@ -281,10 +283,20 @@
case '=':
$operator = self::OPERATOR_EXACT;
break;
- case '':
case '+':
$operator = self::OPERATOR_AND;
break;
+ case '':
+ // See T12995. If this query term contains Chinese, Japanese or
+ // Korean characters, treat the term as a substring term by default.
+ // These languages do not separate words with spaces, so the term
+ // search mode is normally useless.
+ if ($enable_functions && !$is_quoted && phutil_utf8_is_cjk($value)) {
+ $operator = self::OPERATOR_SUBSTRING;
+ } else {
+ $operator = self::OPERATOR_AND;
+ }
+ break;
default:
throw new PhutilSearchQueryCompilerSyntaxException(
pht(
@@ -294,7 +306,7 @@
$result = array(
'operator' => $operator,
- 'quoted' => $token['quoted'],
+ 'quoted' => $is_quoted,
'value' => $value,
);
diff --git a/src/search/__tests__/PhutilSearchQueryCompilerTestCase.php b/src/search/__tests__/PhutilSearchQueryCompilerTestCase.php
--- a/src/search/__tests__/PhutilSearchQueryCompilerTestCase.php
+++ b/src/search/__tests__/PhutilSearchQueryCompilerTestCase.php
@@ -92,6 +92,8 @@
$op_sub = PhutilSearchQueryCompiler::OPERATOR_SUBSTRING;
$op_exact = PhutilSearchQueryCompiler::OPERATOR_EXACT;
+ $mao = "\xE7\x8C\xAB";
+
$function_tests = array(
'cat' => array(
array(null, $op_and, 'cat'),
@@ -119,6 +121,18 @@
'~"core and seven years ag"' => array(
array(null, $op_sub, 'core and seven years ag'),
),
+ $mao => array(
+ array(null, $op_sub, $mao),
+ ),
+ '+'.$mao => array(
+ array(null, $op_and, $mao),
+ ),
+ '~'.$mao => array(
+ array(null, $op_sub, $mao),
+ ),
+ '"'.$mao.'"' => array(
+ array(null, $op_and, $mao),
+ ),
);
$this->assertCompileFunctionQueries($function_tests);
diff --git a/src/utils/__tests__/PhutilUTF8TestCase.php b/src/utils/__tests__/PhutilUTF8TestCase.php
--- a/src/utils/__tests__/PhutilUTF8TestCase.php
+++ b/src/utils/__tests__/PhutilUTF8TestCase.php
@@ -609,6 +609,22 @@
$this->assertTrue(true);
}
+ public function testCJK() {
+   // Each key is a UTF-8 input; each value is the expected detection result.
+   // "\xE2\x98\x83" is U+2603 and "\xE5\xA0\xB1" is U+5831.
+   $cases = array(
+     '' => false,
+     'a' => false,
+     '.' => false,
+     "\xE2\x98\x83" => false,
+     "\xE5\xA0\xB1" => true,
+   );
+   foreach ($cases as $string => $is_cjk) {
+     $label = pht('CJK: "%s"', $string);
+     $this->assertEqual($is_cjk, phutil_utf8_is_cjk($string), $label);
+   }
+ }
+
public function testUTF8BMP() {
$tests = array(
'' => array(
diff --git a/src/utils/utf8.php b/src/utils/utf8.php
--- a/src/utils/utf8.php
+++ b/src/utils/utf8.php
@@ -328,6 +328,64 @@
/**
+ * Test if a string contains Chinese, Japanese, or Korean characters.
+ *
+ * Most languages use spaces to separate words, but these languages do not.
+ *
+ * @param string String to examine, in UTF8.
+ * @return bool True if the string contains Chinese, Japanese, or Korean
+ * characters.
+ */
+function phutil_utf8_is_cjk($string) {
+  // Inclusive (first, last) codepoint bounds of every Unicode block that
+  // counts as CJK here. Only ideograph blocks are listed.
+  //
+  // NOTE(review): Hiragana, Katakana and Hangul syllables fall outside all of
+  // these ranges, so kana-only or Hangul-only strings return false. Confirm
+  // whether that is intended.
+  static $blocks = array(
+    // CJK Unified Ideographs
+    array(0x4E00, 0x9FFF),
+
+    // CJK Unified Ideographs Extension A
+    array(0x3400, 0x4DBF),
+
+    // CJK Unified Ideographs Extension B
+    array(0x20000, 0x2A6DF),
+
+    // CJK Unified Ideographs Extension C
+    array(0x2A700, 0x2B73F),
+
+    // CJK Unified Ideographs Extension D
+    array(0x2B740, 0x2B81F),
+
+    // CJK Unified Ideographs Extension E
+    array(0x2B820, 0x2CEAF),
+
+    // CJK Unified Ideographs Extension F
+    array(0x2CEB0, 0x2EBEF),
+
+    // CJK Compatibility Ideographs
+    array(0xF900, 0xFAFF),
+  );
+
+  foreach (phutil_utf8v_codepoints($string) as $value) {
+    foreach ($blocks as $block) {
+      list($first, $last) = $block;
+
+      // A single codepoint inside any listed block is enough.
+      if ($value >= $first && $value <= $last) {
+        return true;
+      }
+    }
+  }
+
+  // No codepoint fell inside any of the listed blocks.
+  return false;
+}
+
+
+/**
* Split a UTF-8 string into an array of characters. Combining characters are
* also split.
*

File Metadata

Mime Type
text/plain
Expires
Fri, Dec 13, 5:53 AM (20 m, 30 s)
Storage Engine
blob
Storage Format
Encrypted (AES-256-CBC)
Storage Handle
6874329
Default Alt Text
D18634.id44741.diff (5 KB)

Event Timeline