Page Menu
Home
Phabricator
Search
Configure Global Search
Log In
Files
F14233997
D18634.id44741.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
5 KB
Referenced Files
None
Subscribers
None
D18634.id44741.diff
View Options
diff --git a/src/__phutil_library_map__.php b/src/__phutil_library_map__.php
--- a/src/__phutil_library_map__.php
+++ b/src/__phutil_library_map__.php
@@ -561,6 +561,7 @@
'phutil_utf8_encode_codepoint' => 'utils/utf8.php',
'phutil_utf8_hard_wrap' => 'utils/utf8.php',
'phutil_utf8_hard_wrap_html' => 'utils/utf8.php',
+ 'phutil_utf8_is_cjk' => 'utils/utf8.php',
'phutil_utf8_is_combining_character' => 'utils/utf8.php',
'phutil_utf8_strlen' => 'utils/utf8.php',
'phutil_utf8_strtolower' => 'utils/utf8.php',
diff --git a/src/search/PhutilSearchQueryCompiler.php b/src/search/PhutilSearchQueryCompiler.php
--- a/src/search/PhutilSearchQueryCompiler.php
+++ b/src/search/PhutilSearchQueryCompiler.php
@@ -271,6 +271,8 @@
continue;
}
+ $is_quoted = $token['quoted'];
+
switch ($operator_string) {
case '-':
$operator = self::OPERATOR_NOT;
@@ -281,10 +283,20 @@
case '=':
$operator = self::OPERATOR_EXACT;
break;
- case '':
case '+':
$operator = self::OPERATOR_AND;
break;
+ case '':
+ // See T12995. If this query term contains Chinese, Japanese or
+ // Korean characters, treat the term as a substring term by default.
+ // These languages do not separate words with spaces, so the term
+ // search mode is normally useless.
+ if ($enable_functions && !$is_quoted && phutil_utf8_is_cjk($value)) {
+ $operator = self::OPERATOR_SUBSTRING;
+ } else {
+ $operator = self::OPERATOR_AND;
+ }
+ break;
default:
throw new PhutilSearchQueryCompilerSyntaxException(
pht(
@@ -294,7 +306,7 @@
$result = array(
'operator' => $operator,
- 'quoted' => $token['quoted'],
+ 'quoted' => $is_quoted,
'value' => $value,
);
diff --git a/src/search/__tests__/PhutilSearchQueryCompilerTestCase.php b/src/search/__tests__/PhutilSearchQueryCompilerTestCase.php
--- a/src/search/__tests__/PhutilSearchQueryCompilerTestCase.php
+++ b/src/search/__tests__/PhutilSearchQueryCompilerTestCase.php
@@ -92,6 +92,8 @@
$op_sub = PhutilSearchQueryCompiler::OPERATOR_SUBSTRING;
$op_exact = PhutilSearchQueryCompiler::OPERATOR_EXACT;
+ $mao = "\xE7\x8C\xAB";
+
$function_tests = array(
'cat' => array(
array(null, $op_and, 'cat'),
@@ -119,6 +121,18 @@
'~"core and seven years ag"' => array(
array(null, $op_sub, 'core and seven years ag'),
),
+ $mao => array(
+ array(null, $op_sub, $mao),
+ ),
+ '+'.$mao => array(
+ array(null, $op_and, $mao),
+ ),
+ '~'.$mao => array(
+ array(null, $op_sub, $mao),
+ ),
+ '"'.$mao.'"' => array(
+ array(null, $op_and, $mao),
+ ),
);
$this->assertCompileFunctionQueries($function_tests);
diff --git a/src/utils/__tests__/PhutilUTF8TestCase.php b/src/utils/__tests__/PhutilUTF8TestCase.php
--- a/src/utils/__tests__/PhutilUTF8TestCase.php
+++ b/src/utils/__tests__/PhutilUTF8TestCase.php
@@ -609,6 +609,22 @@
$this->assertTrue(true);
}
+ /**
+  * Check that phutil_utf8_is_cjk() reports the expected result for a handful
+  * of representative strings.
+  */
+ public function testCJK() {
+   // Input string => whether it is expected to be detected as CJK.
+   $cases = array(
+     '' => false,
+     'a' => false,
+     '.' => false,
+     // U+2603 SNOWMAN: multibyte, but not an ideograph.
+     "\xE2\x98\x83" => false,
+     // U+5831, a character in the CJK Unified Ideographs block.
+     "\xE5\xA0\xB1" => true,
+   );
+
+   foreach ($cases as $string => $expected) {
+     $this->assertEqual(
+       $expected,
+       phutil_utf8_is_cjk($string),
+       pht('CJK: "%s"', $string));
+   }
+ }
+
public function testUTF8BMP() {
$tests = array(
'' => array(
diff --git a/src/utils/utf8.php b/src/utils/utf8.php
--- a/src/utils/utf8.php
+++ b/src/utils/utf8.php
@@ -328,6 +328,64 @@
/**
+ * Test if a string contains Chinese, Japanese, or Korean characters.
+ *
+ * Text written with Han ideographs or Japanese kana does not normally
+ * separate words with spaces, so this test can be used to decide whether
+ * whitespace-based word splitting is meaningful for a string.
+ *
+ * A string matches if any one of its codepoints falls in a CJK ideograph
+ * block (Unified Ideographs and Extensions A-F, Compatibility Ideographs and
+ * the Compatibility Ideographs Supplement) or in the Hiragana or Katakana
+ * blocks. Hangul is not matched: for Korean, only ideographs are detected.
+ *
+ * @param string String to examine, in UTF8.
+ * @return bool True if the string contains Chinese, Japanese, or Korean
+ *              characters.
+ */
+function phutil_utf8_is_cjk($string) {
+  // Inclusive (first, last) codepoint ranges which count as CJK.
+  static $ranges = array(
+    // CJK Unified Ideographs
+    array(0x4E00, 0x9FFF),
+    // CJK Unified Ideographs Extension A
+    array(0x3400, 0x4DBF),
+    // CJK Unified Ideographs Extension B
+    array(0x20000, 0x2A6DF),
+    // CJK Unified Ideographs Extension C
+    array(0x2A700, 0x2B73F),
+    // CJK Unified Ideographs Extension D
+    array(0x2B740, 0x2B81F),
+    // CJK Unified Ideographs Extension E
+    array(0x2B820, 0x2CEAF),
+    // CJK Unified Ideographs Extension F
+    array(0x2CEB0, 0x2EBEF),
+    // CJK Compatibility Ideographs
+    array(0xF900, 0xFAFF),
+    // CJK Compatibility Ideographs Supplement
+    array(0x2F800, 0x2FA1F),
+    // Hiragana. Kana-only Japanese text contains no ideographs, but is still
+    // written without spaces between words.
+    array(0x3040, 0x309F),
+    // Katakana
+    array(0x30A0, 0x30FF),
+  );
+
+  foreach (phutil_utf8v_codepoints($string) as $codepoint) {
+    foreach ($ranges as $range) {
+      if ($codepoint >= $range[0] && $codepoint <= $range[1]) {
+        // A single matching character is enough.
+        return true;
+      }
+    }
+  }
+
+  return false;
+}
+
+
+/**
* Split a UTF-8 string into an array of characters. Combining characters are
* also split.
*
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Fri, Dec 13, 5:53 AM (20 m, 30 s)
Storage Engine
blob
Storage Format
Encrypted (AES-256-CBC)
Storage Handle
6874329
Default Alt Text
D18634.id44741.diff (5 KB)
Attached To
Mode
D18634: Default CJK query terms to "substring" mode, not "term" mode
Attached
Detach File
Event Timeline
Log In to Comment