Page Menu
Home
Phabricator
Search
Configure Global Search
Log In
Files
F18101005
D21129.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
9 KB
Referenced Files
None
Subscribers
None
D21129.diff
View Options
diff --git a/src/__phutil_library_map__.php b/src/__phutil_library_map__.php
--- a/src/__phutil_library_map__.php
+++ b/src/__phutil_library_map__.php
@@ -922,6 +922,7 @@
'phutil_hashes_are_identical' => 'utils/utils.php',
'phutil_http_parameter_pair' => 'utils/utils.php',
'phutil_ini_decode' => 'utils/utils.php',
+ 'phutil_is_ascii' => 'utils/utf8.php',
'phutil_is_hiphop_runtime' => 'utils/utils.php',
'phutil_is_interactive' => 'utils/utils.php',
'phutil_is_natural_list' => 'utils/utils.php',
@@ -953,6 +954,7 @@
'phutil_utf8_hard_wrap_html' => 'utils/utf8.php',
'phutil_utf8_is_cjk' => 'utils/utf8.php',
'phutil_utf8_is_combining_character' => 'utils/utf8.php',
+ 'phutil_utf8_normalize_for_search' => 'utils/utf8.php',
'phutil_utf8_strlen' => 'utils/utf8.php',
'phutil_utf8_strtolower' => 'utils/utf8.php',
'phutil_utf8_strtoupper' => 'utils/utf8.php',
diff --git a/src/utils/__tests__/PhutilUTF8TestCase.php b/src/utils/__tests__/PhutilUTF8TestCase.php
--- a/src/utils/__tests__/PhutilUTF8TestCase.php
+++ b/src/utils/__tests__/PhutilUTF8TestCase.php
@@ -824,4 +824,48 @@
phutil_set_system_locale($original_locale);
}
+ public function testUTF8SearchNormalize() {
+   // UTF-8 encoding of U+0301, the combining acute accent.
+   $acute_mark = "\xCC\x81";
+
+   // UTF-8 encodings of the precomposed characters U+00E9 and U+00C9.
+   $small_e_acute = "\xC3\xA9";
+   $capital_e_acute = "\xC3\x89";
+
+   // Each key is an input string; each value is the normalized form that
+   // phutil_utf8_normalize_for_search() is expected to return for it.
+   $cases = array(
+     // Plain ASCII letters, in either case.
+     'e' => 'e',
+     'E' => 'e',
+
+     // The same accented letter written as a base letter plus a combining
+     // mark, and as a single precomposed character.
+     'e'.$acute_mark => 'e',
+     'E'.$acute_mark => 'e',
+     $small_e_acute => 'e',
+     $capital_e_acute => 'e',
+
+     // A whole word, with the tilde both precomposed and combining.
+     "jalape\xC3\xB1o" => 'jalapeno',
+     "JALAPE\xC3\x91O" => 'jalapeno',
+     "jalapen\xCC\x83o" => 'jalapeno',
+     "JALAPEN\xCC\x83O" => 'jalapeno',
+
+     // At present, a latin character followed by any run of combining
+     // characters normalizes to just the latin character, so "zalgo" text
+     // covered in meaningless combining marks comes out looking sensible.
+     // Nothing requires this and it may not be a good rule; it was simply
+     // cheap to build from the primitives that already existed.
+
+     // A "q" followed by three combining circles. At time of writing this
+     // is not a real character in any human language.
+     "q\xE2\x83\x9D\xE2\x83\x9D\xE2\x83\x9D" => 'q',
+   );
+
+   foreach ($cases as $input => $expect) {
+     $actual = phutil_utf8_normalize_for_search($input);
+
+     // Include a loggable rendering of the input so a failure is readable
+     // even when the raw bytes do not display well.
+     $message = pht(
+       'Normalize for Search of: %s (%s)',
+       $input,
+       phutil_loggable_string($input));
+
+     $this->assertEqual($expect, $actual, $message);
+   }
+ }
+
}
diff --git a/src/utils/utf8.php b/src/utils/utf8.php
--- a/src/utils/utf8.php
+++ b/src/utils/utf8.php
@@ -120,7 +120,7 @@
function phutil_is_utf8_slowly($string, $only_bmp = false) {
// First, check the common case of normal ASCII strings. We're fine if
// the string contains no bytes larger than 127.
- if (preg_match('/^[\x01-\x7F]+\z/', $string)) {
+ if (phutil_is_ascii($string)) {
return true;
}
@@ -314,7 +314,7 @@
$string = preg_replace("/\x1B\[\d*m/", '', $string);
// In the common case of an ASCII string, just return the string length.
- if (preg_match('/^[\x01-\x7F]*\z/', $string)) {
+ if (phutil_is_ascii($string)) {
return strlen($string);
}
@@ -976,3 +976,45 @@
$locale));
}
}
+
+ /**
+  * Test whether a string consists only of bytes in the range 0x01-0x7F.
+  *
+  * The empty string passes this test. A string containing a NUL byte, or any
+  * byte above 0x7F, does not.
+  *
+  * @param   string  String to test.
+  * @return  bool    True if every byte of the string is in the range.
+  */
+ function phutil_is_ascii($value) {
+   // preg_match() returns 1 on a match, 0 on no match and false on error;
+   // only an actual match counts.
+   $matched = preg_match('/^[\x01-\x7F]*\z/', $value);
+   return ($matched === 1);
+ }
+
+ /**
+  * Roughly normalize a UTF-8 string for search indexing.
+  *
+  * A string which is not plain ASCII is split into combined character
+  * sequences with @{function:phutil_utf8v_combined}. A sequence listed in
+  * "support/utf8/search-normalization-map.php" is replaced with its mapped
+  * form. Any other sequence whose first byte is ASCII is reduced to that
+  * first byte, discarding everything that followed it. Remaining sequences
+  * are left unchanged. The result is then lowercased with
+  * @{function:phutil_utf8_strtolower}.
+  *
+  * See T13501. This function does not work properly for many inputs and only
+  * attempts to roughly normalize the most common latin sequences.
+  *
+  * @param   string  String to normalize.
+  * @return  string  Normalized, lowercased string.
+  */
+ function phutil_utf8_normalize_for_search($value) {
+   if (!phutil_is_ascii($value)) {
+     static $map;
+     if ($map === null) {
+       $arcanist_root = dirname(phutil_get_library_root('arcanist'));
+       $utf8_root = $arcanist_root.'/support/utf8/';
+
+       // Load with "require", not "require_once". If the file has already
+       // been included elsewhere in this process, "require_once" returns
+       // "true" instead of the array, and every lookup below would then
+       // silently miss. The static variable already keeps this function from
+       // loading the file more than once.
+       $map = require $utf8_root.'search-normalization-map.php';
+     }
+
+     $sequences = phutil_utf8v_combined($value);
+     foreach ($sequences as $key => $bytes) {
+       // If this sequence of characters has an explicit normal form in the
+       // map, replace it.
+       if (isset($map[$bytes])) {
+         $sequences[$key] = $map[$bytes];
+         continue;
+       }
+
+       // If this sequence has no explicit normal form but begins with an
+       // ASCII byte, assume that discarding everything after that byte
+       // produces a reasonable normalization. The bound is inclusive so it
+       // agrees with the byte range phutil_is_ascii() accepts.
+       if (ord($bytes[0]) <= 0x7F) {
+         $sequences[$key] = $bytes[0];
+       }
+     }
+
+     $value = implode('', $sequences);
+   }
+
+   return phutil_utf8_strtolower($value);
+ }
diff --git a/support/utf8/search-normalization-map.php b/support/utf8/search-normalization-map.php
new file mode 100644
--- /dev/null
+++ b/support/utf8/search-normalization-map.php
@@ -0,0 +1,203 @@
+ <?php
+
+ // See T13501. This is a very rough first draft that just aims to get these
+ // rules approximately right for some of the most common latin inputs.
+ //
+ // This file returns a lookup table. Each key is a raw UTF-8 byte sequence and
+ // each value is the one- or two-letter ASCII string to substitute for it.
+ // Values are not lowercased here; phutil_utf8_normalize_for_search() lowercases
+ // the whole string after applying the table.
+ //
+ // NOTE(review): "\xC3\x9F" maps to a single "s" -- confirm whether "ss" was
+ // intended.
+ // NOTE(review): the "\xC3\x98" entry appears after "\xC3\xBF", out of byte
+ // order with the entries around it. This does not affect lookups.
+
+ return array(
+ "\xC2\xAA" => 'a',
+ "\xC2\xBA" => 'o',
+ "\xC3\x80" => 'A',
+ "\xC3\x81" => 'A',
+ "\xC3\x82" => 'A',
+ "\xC3\x83" => 'A',
+ "\xC3\x84" => 'A',
+ "\xC3\x85" => 'A',
+ "\xC3\x86" => 'AE',
+ "\xC3\x87" => 'C',
+ "\xC3\x88" => 'E',
+ "\xC3\x89" => 'E',
+ "\xC3\x8A" => 'E',
+ "\xC3\x8B" => 'E',
+ "\xC3\x8C" => 'I',
+ "\xC3\x8D" => 'I',
+ "\xC3\x8E" => 'I',
+ "\xC3\x8F" => 'I',
+ "\xC3\x90" => 'D',
+ "\xC3\x91" => 'N',
+ "\xC3\x92" => 'O',
+ "\xC3\x93" => 'O',
+ "\xC3\x94" => 'O',
+ "\xC3\x95" => 'O',
+ "\xC3\x96" => 'O',
+ "\xC3\x99" => 'U',
+ "\xC3\x9A" => 'U',
+ "\xC3\x9B" => 'U',
+ "\xC3\x9C" => 'U',
+ "\xC3\x9D" => 'Y',
+ "\xC3\x9E" => 'TH',
+ "\xC3\x9F" => 's',
+ "\xC3\xA0" => 'a',
+ "\xC3\xA1" => 'a',
+ "\xC3\xA2" => 'a',
+ "\xC3\xA3" => 'a',
+ "\xC3\xA4" => 'a',
+ "\xC3\xA5" => 'a',
+ "\xC3\xA6" => 'ae',
+ "\xC3\xA7" => 'c',
+ "\xC3\xA8" => 'e',
+ "\xC3\xA9" => 'e',
+ "\xC3\xAA" => 'e',
+ "\xC3\xAB" => 'e',
+ "\xC3\xAC" => 'i',
+ "\xC3\xAD" => 'i',
+ "\xC3\xAE" => 'i',
+ "\xC3\xAF" => 'i',
+ "\xC3\xB0" => 'd',
+ "\xC3\xB1" => 'n',
+ "\xC3\xB2" => 'o',
+ "\xC3\xB3" => 'o',
+ "\xC3\xB4" => 'o',
+ "\xC3\xB5" => 'o',
+ "\xC3\xB6" => 'o',
+ "\xC3\xB8" => 'o',
+ "\xC3\xB9" => 'u',
+ "\xC3\xBA" => 'u',
+ "\xC3\xBB" => 'u',
+ "\xC3\xBC" => 'u',
+ "\xC3\xBD" => 'y',
+ "\xC3\xBE" => 'th',
+ "\xC3\xBF" => 'y',
+ "\xC3\x98" => 'O',
+ "\xC4\x80" => 'A',
+ "\xC4\x81" => 'a',
+ "\xC4\x82" => 'A',
+ "\xC4\x83" => 'a',
+ "\xC4\x84" => 'A',
+ "\xC4\x85" => 'a',
+ "\xC4\x86" => 'C',
+ "\xC4\x87" => 'c',
+ "\xC4\x88" => 'C',
+ "\xC4\x89" => 'c',
+ "\xC4\x8A" => 'C',
+ "\xC4\x8B" => 'c',
+ "\xC4\x8C" => 'C',
+ "\xC4\x8D" => 'c',
+ "\xC4\x8E" => 'D',
+ "\xC4\x8F" => 'd',
+ "\xC4\x90" => 'D',
+ "\xC4\x91" => 'd',
+ "\xC4\x92" => 'E',
+ "\xC4\x93" => 'e',
+ "\xC4\x94" => 'E',
+ "\xC4\x95" => 'e',
+ "\xC4\x96" => 'E',
+ "\xC4\x97" => 'e',
+ "\xC4\x98" => 'E',
+ "\xC4\x99" => 'e',
+ "\xC4\x9A" => 'E',
+ "\xC4\x9B" => 'e',
+ "\xC4\x9C" => 'G',
+ "\xC4\x9D" => 'g',
+ "\xC4\x9E" => 'G',
+ "\xC4\x9F" => 'g',
+ "\xC4\xA0" => 'G',
+ "\xC4\xA1" => 'g',
+ "\xC4\xA2" => 'G',
+ "\xC4\xA3" => 'g',
+ "\xC4\xA4" => 'H',
+ "\xC4\xA5" => 'h',
+ "\xC4\xA6" => 'H',
+ "\xC4\xA7" => 'h',
+ "\xC4\xA8" => 'I',
+ "\xC4\xA9" => 'i',
+ "\xC4\xAA" => 'I',
+ "\xC4\xAB" => 'i',
+ "\xC4\xAC" => 'I',
+ "\xC4\xAD" => 'i',
+ "\xC4\xAE" => 'I',
+ "\xC4\xAF" => 'i',
+ "\xC4\xB0" => 'I',
+ "\xC4\xB1" => 'i',
+ "\xC4\xB2" => 'IJ',
+ "\xC4\xB3" => 'ij',
+ "\xC4\xB4" => 'J',
+ "\xC4\xB5" => 'j',
+ "\xC4\xB6" => 'K',
+ "\xC4\xB7" => 'k',
+ "\xC4\xB8" => 'k',
+ "\xC4\xB9" => 'L',
+ "\xC4\xBA" => 'l',
+ "\xC4\xBB" => 'L',
+ "\xC4\xBC" => 'l',
+ "\xC4\xBD" => 'L',
+ "\xC4\xBE" => 'l',
+ "\xC4\xBF" => 'L',
+ "\xC5\x80" => 'l',
+ "\xC5\x81" => 'L',
+ "\xC5\x82" => 'l',
+ "\xC5\x83" => 'N',
+ "\xC5\x84" => 'n',
+ "\xC5\x85" => 'N',
+ "\xC5\x86" => 'n',
+ "\xC5\x87" => 'N',
+ "\xC5\x88" => 'n',
+ "\xC5\x89" => 'n',
+ "\xC5\x8A" => 'N',
+ "\xC5\x8B" => 'n',
+ "\xC5\x8C" => 'O',
+ "\xC5\x8D" => 'o',
+ "\xC5\x8E" => 'O',
+ "\xC5\x8F" => 'o',
+ "\xC5\x90" => 'O',
+ "\xC5\x91" => 'o',
+ "\xC5\x92" => 'OE',
+ "\xC5\x93" => 'oe',
+ "\xC5\x94" => 'R',
+ "\xC5\x95" => 'r',
+ "\xC5\x96" => 'R',
+ "\xC5\x97" => 'r',
+ "\xC5\x98" => 'R',
+ "\xC5\x99" => 'r',
+ "\xC5\x9A" => 'S',
+ "\xC5\x9B" => 's',
+ "\xC5\x9C" => 'S',
+ "\xC5\x9D" => 's',
+ "\xC5\x9E" => 'S',
+ "\xC5\x9F" => 's',
+ "\xC5\xA0" => 'S',
+ "\xC5\xA1" => 's',
+ "\xC5\xA2" => 'T',
+ "\xC5\xA3" => 't',
+ "\xC5\xA4" => 'T',
+ "\xC5\xA5" => 't',
+ "\xC5\xA6" => 'T',
+ "\xC5\xA7" => 't',
+ "\xC5\xA8" => 'U',
+ "\xC5\xA9" => 'u',
+ "\xC5\xAA" => 'U',
+ "\xC5\xAB" => 'u',
+ "\xC5\xAC" => 'U',
+ "\xC5\xAD" => 'u',
+ "\xC5\xAE" => 'U',
+ "\xC5\xAF" => 'u',
+ "\xC5\xB0" => 'U',
+ "\xC5\xB1" => 'u',
+ "\xC5\xB2" => 'U',
+ "\xC5\xB3" => 'u',
+ "\xC5\xB4" => 'W',
+ "\xC5\xB5" => 'w',
+ "\xC5\xB6" => 'Y',
+ "\xC5\xB7" => 'y',
+ "\xC5\xB8" => 'Y',
+ "\xC5\xB9" => 'Z',
+ "\xC5\xBA" => 'z',
+ "\xC5\xBB" => 'Z',
+ "\xC5\xBC" => 'z',
+ "\xC5\xBD" => 'Z',
+ "\xC5\xBE" => 'z',
+ "\xC5\xBF" => 's',
+ "\xC8\x98" => 'S',
+ "\xC8\x99" => 's',
+ "\xC8\x9A" => 'T',
+ "\xC8\x9B" => 't',
+ );
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Aug 10 2025, 12:15 PM (11 w, 6 d ago)
Storage Engine
blob
Storage Format
Encrypted (AES-256-CBC)
Storage Handle
8895720
Default Alt Text
D21129.diff (9 KB)
Attached To
Mode
D21129: (Early Draft) Add a function to normalize UTF8 for search indexing
Attached
Detach File
Event Timeline
Log In to Comment