Page MenuHomePhabricator

D21129.id50314.diff
No OneTemporary

D21129.id50314.diff

diff --git a/src/__phutil_library_map__.php b/src/__phutil_library_map__.php
--- a/src/__phutil_library_map__.php
+++ b/src/__phutil_library_map__.php
@@ -922,6 +922,7 @@
'phutil_hashes_are_identical' => 'utils/utils.php',
'phutil_http_parameter_pair' => 'utils/utils.php',
'phutil_ini_decode' => 'utils/utils.php',
+ 'phutil_is_ascii' => 'utils/utf8.php',
'phutil_is_hiphop_runtime' => 'utils/utils.php',
'phutil_is_interactive' => 'utils/utils.php',
'phutil_is_natural_list' => 'utils/utils.php',
@@ -953,6 +954,7 @@
'phutil_utf8_hard_wrap_html' => 'utils/utf8.php',
'phutil_utf8_is_cjk' => 'utils/utf8.php',
'phutil_utf8_is_combining_character' => 'utils/utf8.php',
+ 'phutil_utf8_normalize_for_search' => 'utils/utf8.php',
'phutil_utf8_strlen' => 'utils/utf8.php',
'phutil_utf8_strtolower' => 'utils/utf8.php',
'phutil_utf8_strtoupper' => 'utils/utf8.php',
diff --git a/src/utils/__tests__/PhutilUTF8TestCase.php b/src/utils/__tests__/PhutilUTF8TestCase.php
--- a/src/utils/__tests__/PhutilUTF8TestCase.php
+++ b/src/utils/__tests__/PhutilUTF8TestCase.php
@@ -824,4 +824,48 @@
phutil_set_system_locale($original_locale);
}
+ /**
+  * Verify that search normalization strips accents from common latin
+  * characters (in both precomposed and combining forms) and lowercases
+  * the result.
+  */
+ public function testUTF8SearchNormalize() {
+   // U+0301 COMBINING ACUTE ACCENT and U+0303 COMBINING TILDE.
+   $acute = "\xCC\x81";
+   $tilde = "\xCC\x83";
+
+   // Precomposed "e" and "E" with acute accent (U+00E9, U+00C9).
+   $e_acute_lower = "\xC3\xA9";
+   $e_acute_upper = "\xC3\x89";
+
+   // U+20DD COMBINING ENCLOSING CIRCLE.
+   $circle = "\xE2\x83\x9D";
+
+   $cases = array(
+     // Plain ASCII input.
+     'e' => 'e',
+     'E' => 'e',
+
+     // The same accented letter, spelled as base character plus combining
+     // mark and as a single precomposed code point.
+     'e'.$acute => 'e',
+     'E'.$acute => 'e',
+     $e_acute_lower => 'e',
+     $e_acute_upper => 'e',
+
+     // A whole word, with the tilde precomposed and then combining.
+     "jalape\xC3\xB1o" => 'jalapeno',
+     "JALAPE\xC3\x91O" => 'jalapeno',
+     'jalapen'.$tilde.'o' => 'jalapeno',
+     'JALAPEN'.$tilde.'O' => 'jalapeno',
+
+     // A latin character followed by any run of combining characters
+     // currently normalizes to the bare latin character, so "zalgo" text
+     // full of nonsensical combining marks comes out reasonably. This is
+     // neither required nor necessarily a good rule; it was simply easy to
+     // build from the available primitives.
+     //
+     // This case is a "q" with three combining circles, which is not a
+     // legitimate character in any human language at time of writing.
+     'q'.$circle.$circle.$circle => 'q',
+   );
+
+   foreach ($cases as $raw => $expected) {
+     $normalized = phutil_utf8_normalize_for_search($raw);
+
+     $message = pht(
+       'Normalize for Search of: %s (%s)',
+       $raw,
+       phutil_loggable_string($raw));
+
+     $this->assertEqual($expected, $normalized, $message);
+   }
+ }
+
}
diff --git a/src/utils/utf8.php b/src/utils/utf8.php
--- a/src/utils/utf8.php
+++ b/src/utils/utf8.php
@@ -120,7 +120,7 @@
function phutil_is_utf8_slowly($string, $only_bmp = false) {
// First, check the common case of normal ASCII strings. We're fine if
// the string contains no bytes larger than 127.
- if (preg_match('/^[\x01-\x7F]+\z/', $string)) {
+ if (phutil_is_ascii($string)) {
return true;
}
@@ -314,7 +314,7 @@
$string = preg_replace("/\x1B\[\d*m/", '', $string);
// In the common case of an ASCII string, just return the string length.
- if (preg_match('/^[\x01-\x7F]*\z/', $string)) {
+ if (phutil_is_ascii($string)) {
return strlen($string);
}
@@ -976,3 +976,45 @@
$locale));
}
}
+
+/**
+ * Test whether a string consists solely of bytes in the range 0x01-0x7F.
+ *
+ * The empty string passes this test. A string containing a NUL byte or any
+ * byte above 0x7F does not.
+ *
+ * @param string $value String to test.
+ * @return bool True if every byte of the string is in the range 0x01-0x7F.
+ */
+function phutil_is_ascii($value) {
+  $matched = preg_match('/^[\x01-\x7F]*\z/', $value);
+
+  // preg_match() yields 1 on a match, and 0 or false otherwise.
+  return ($matched === 1);
+}
+
+/**
+ * Roughly normalize a string for search matching.
+ *
+ * Common accented latin characters are replaced with unaccented ASCII
+ * equivalents, combining characters attached to an ASCII base character are
+ * discarded, and the result is lowercased.
+ *
+ * See T13501. This function does not work properly for many inputs and only
+ * attempts to roughly normalize the most common latin sequences.
+ *
+ * @param string $value String to normalize.
+ * @return string Normalized string.
+ */
+function phutil_utf8_normalize_for_search($value) {
+  // Pure ASCII input has nothing to strip and only needs to be lowercased.
+  if (!phutil_is_ascii($value)) {
+    static $map;
+    if ($map === null) {
+      $arcanist_root = dirname(phutil_get_library_root('arcanist'));
+      $utf8_root = $arcanist_root.'/support/utf8/';
+
+      // Use "require", not "require_once": if anything else in the process
+      // has already loaded this file, "require_once" returns true instead of
+      // the array and every lookup below would silently miss. The static
+      // cache already guarantees we only load the file once from here.
+      $map = require $utf8_root.'search-normalization-map.php';
+    }
+
+    $sequences = phutil_utf8v_combined($value);
+    foreach ($sequences as $key => $bytes) {
+      // If this sequence of characters has an explicit normal form in the
+      // map, replace it.
+      if (isset($map[$bytes])) {
+        $sequences[$key] = $map[$bytes];
+        continue;
+      }
+
+      // If this sequence of characters does not have an explicit normal
+      // form, but the base character is an ASCII character, assume that
+      // discarding all the combining characters produces a reasonable
+      // normalization. ASCII runs through 0x7F inclusive, which matches the
+      // upper bound used by phutil_is_ascii().
+      if (ord($bytes[0]) < 0x80) {
+        $sequences[$key] = $bytes[0];
+      }
+    }
+
+    $value = implode('', $sequences);
+  }
+
+  return phutil_utf8_strtolower($value);
+}
diff --git a/support/utf8/search-normalization-map.php b/support/utf8/search-normalization-map.php
new file mode 100644
--- /dev/null
+++ b/support/utf8/search-normalization-map.php
@@ -0,0 +1,203 @@
+<?php
+
+// See T13501. This is a very rough first draft that just aims to get these
+// rules approximately right for some of the most common latin inputs.
+
+// Map from the UTF-8 byte sequence of a single accented or variant latin
+// character to an ASCII replacement. Replacements keep the letter case of
+// the source character, and a few characters map to two letters (for
+// example, "AE", "TH", "IJ" and "OE").
+return array(
+ // Letters from the Latin-1 Supplement block (U+00AA through U+00FF). The
+ // multiplication and division signs (U+00D7, U+00F7) are not mapped.
+ "\xC2\xAA" => 'a',
+ "\xC2\xBA" => 'o',
+ "\xC3\x80" => 'A',
+ "\xC3\x81" => 'A',
+ "\xC3\x82" => 'A',
+ "\xC3\x83" => 'A',
+ "\xC3\x84" => 'A',
+ "\xC3\x85" => 'A',
+ "\xC3\x86" => 'AE',
+ "\xC3\x87" => 'C',
+ "\xC3\x88" => 'E',
+ "\xC3\x89" => 'E',
+ "\xC3\x8A" => 'E',
+ "\xC3\x8B" => 'E',
+ "\xC3\x8C" => 'I',
+ "\xC3\x8D" => 'I',
+ "\xC3\x8E" => 'I',
+ "\xC3\x8F" => 'I',
+ "\xC3\x90" => 'D',
+ "\xC3\x91" => 'N',
+ "\xC3\x92" => 'O',
+ "\xC3\x93" => 'O',
+ "\xC3\x94" => 'O',
+ "\xC3\x95" => 'O',
+ "\xC3\x96" => 'O',
+ "\xC3\x99" => 'U',
+ "\xC3\x9A" => 'U',
+ "\xC3\x9B" => 'U',
+ "\xC3\x9C" => 'U',
+ "\xC3\x9D" => 'Y',
+ "\xC3\x9E" => 'TH',
+ // NOTE(review): U+00DF (sharp s) maps to a single "s" here, while "ss" is
+ // the more conventional transliteration. Confirm this is intended.
+ "\xC3\x9F" => 's',
+ "\xC3\xA0" => 'a',
+ "\xC3\xA1" => 'a',
+ "\xC3\xA2" => 'a',
+ "\xC3\xA3" => 'a',
+ "\xC3\xA4" => 'a',
+ "\xC3\xA5" => 'a',
+ "\xC3\xA6" => 'ae',
+ "\xC3\xA7" => 'c',
+ "\xC3\xA8" => 'e',
+ "\xC3\xA9" => 'e',
+ "\xC3\xAA" => 'e',
+ "\xC3\xAB" => 'e',
+ "\xC3\xAC" => 'i',
+ "\xC3\xAD" => 'i',
+ "\xC3\xAE" => 'i',
+ "\xC3\xAF" => 'i',
+ "\xC3\xB0" => 'd',
+ "\xC3\xB1" => 'n',
+ "\xC3\xB2" => 'o',
+ "\xC3\xB3" => 'o',
+ "\xC3\xB4" => 'o',
+ "\xC3\xB5" => 'o',
+ "\xC3\xB6" => 'o',
+ "\xC3\xB8" => 'o',
+ "\xC3\xB9" => 'u',
+ "\xC3\xBA" => 'u',
+ "\xC3\xBB" => 'u',
+ "\xC3\xBC" => 'u',
+ "\xC3\xBD" => 'y',
+ "\xC3\xBE" => 'th',
+ "\xC3\xBF" => 'y',
+ // U+00D8 (capital O with stroke). This entry is listed out of code point
+ // order; its lowercase counterpart U+00F8 appears above.
+ "\xC3\x98" => 'O',
+ // Every code point in the Latin Extended-A block (U+0100 through U+017F).
+ "\xC4\x80" => 'A',
+ "\xC4\x81" => 'a',
+ "\xC4\x82" => 'A',
+ "\xC4\x83" => 'a',
+ "\xC4\x84" => 'A',
+ "\xC4\x85" => 'a',
+ "\xC4\x86" => 'C',
+ "\xC4\x87" => 'c',
+ "\xC4\x88" => 'C',
+ "\xC4\x89" => 'c',
+ "\xC4\x8A" => 'C',
+ "\xC4\x8B" => 'c',
+ "\xC4\x8C" => 'C',
+ "\xC4\x8D" => 'c',
+ "\xC4\x8E" => 'D',
+ "\xC4\x8F" => 'd',
+ "\xC4\x90" => 'D',
+ "\xC4\x91" => 'd',
+ "\xC4\x92" => 'E',
+ "\xC4\x93" => 'e',
+ "\xC4\x94" => 'E',
+ "\xC4\x95" => 'e',
+ "\xC4\x96" => 'E',
+ "\xC4\x97" => 'e',
+ "\xC4\x98" => 'E',
+ "\xC4\x99" => 'e',
+ "\xC4\x9A" => 'E',
+ "\xC4\x9B" => 'e',
+ "\xC4\x9C" => 'G',
+ "\xC4\x9D" => 'g',
+ "\xC4\x9E" => 'G',
+ "\xC4\x9F" => 'g',
+ "\xC4\xA0" => 'G',
+ "\xC4\xA1" => 'g',
+ "\xC4\xA2" => 'G',
+ "\xC4\xA3" => 'g',
+ "\xC4\xA4" => 'H',
+ "\xC4\xA5" => 'h',
+ "\xC4\xA6" => 'H',
+ "\xC4\xA7" => 'h',
+ "\xC4\xA8" => 'I',
+ "\xC4\xA9" => 'i',
+ "\xC4\xAA" => 'I',
+ "\xC4\xAB" => 'i',
+ "\xC4\xAC" => 'I',
+ "\xC4\xAD" => 'i',
+ "\xC4\xAE" => 'I',
+ "\xC4\xAF" => 'i',
+ "\xC4\xB0" => 'I',
+ "\xC4\xB1" => 'i',
+ "\xC4\xB2" => 'IJ',
+ "\xC4\xB3" => 'ij',
+ "\xC4\xB4" => 'J',
+ "\xC4\xB5" => 'j',
+ "\xC4\xB6" => 'K',
+ "\xC4\xB7" => 'k',
+ "\xC4\xB8" => 'k',
+ "\xC4\xB9" => 'L',
+ "\xC4\xBA" => 'l',
+ "\xC4\xBB" => 'L',
+ "\xC4\xBC" => 'l',
+ "\xC4\xBD" => 'L',
+ "\xC4\xBE" => 'l',
+ "\xC4\xBF" => 'L',
+ "\xC5\x80" => 'l',
+ "\xC5\x81" => 'L',
+ "\xC5\x82" => 'l',
+ "\xC5\x83" => 'N',
+ "\xC5\x84" => 'n',
+ "\xC5\x85" => 'N',
+ "\xC5\x86" => 'n',
+ "\xC5\x87" => 'N',
+ "\xC5\x88" => 'n',
+ "\xC5\x89" => 'n',
+ "\xC5\x8A" => 'N',
+ "\xC5\x8B" => 'n',
+ "\xC5\x8C" => 'O',
+ "\xC5\x8D" => 'o',
+ "\xC5\x8E" => 'O',
+ "\xC5\x8F" => 'o',
+ "\xC5\x90" => 'O',
+ "\xC5\x91" => 'o',
+ "\xC5\x92" => 'OE',
+ "\xC5\x93" => 'oe',
+ "\xC5\x94" => 'R',
+ "\xC5\x95" => 'r',
+ "\xC5\x96" => 'R',
+ "\xC5\x97" => 'r',
+ "\xC5\x98" => 'R',
+ "\xC5\x99" => 'r',
+ "\xC5\x9A" => 'S',
+ "\xC5\x9B" => 's',
+ "\xC5\x9C" => 'S',
+ "\xC5\x9D" => 's',
+ "\xC5\x9E" => 'S',
+ "\xC5\x9F" => 's',
+ "\xC5\xA0" => 'S',
+ "\xC5\xA1" => 's',
+ "\xC5\xA2" => 'T',
+ "\xC5\xA3" => 't',
+ "\xC5\xA4" => 'T',
+ "\xC5\xA5" => 't',
+ "\xC5\xA6" => 'T',
+ "\xC5\xA7" => 't',
+ "\xC5\xA8" => 'U',
+ "\xC5\xA9" => 'u',
+ "\xC5\xAA" => 'U',
+ "\xC5\xAB" => 'u',
+ "\xC5\xAC" => 'U',
+ "\xC5\xAD" => 'u',
+ "\xC5\xAE" => 'U',
+ "\xC5\xAF" => 'u',
+ "\xC5\xB0" => 'U',
+ "\xC5\xB1" => 'u',
+ "\xC5\xB2" => 'U',
+ "\xC5\xB3" => 'u',
+ "\xC5\xB4" => 'W',
+ "\xC5\xB5" => 'w',
+ "\xC5\xB6" => 'Y',
+ "\xC5\xB7" => 'y',
+ "\xC5\xB8" => 'Y',
+ "\xC5\xB9" => 'Z',
+ "\xC5\xBA" => 'z',
+ "\xC5\xBB" => 'Z',
+ "\xC5\xBC" => 'z',
+ "\xC5\xBD" => 'Z',
+ "\xC5\xBE" => 'z',
+ "\xC5\xBF" => 's',
+ // Latin Extended-B: S and T with comma below (U+0218 through U+021B).
+ "\xC8\x98" => 'S',
+ "\xC8\x99" => 's',
+ "\xC8\x9A" => 'T',
+ "\xC8\x9B" => 't',
+);

File Metadata

Mime Type
text/plain
Expires
Aug 16 2025, 7:12 PM (10 w, 6 d ago)
Storage Engine
blob
Storage Format
Encrypted (AES-256-CBC)
Storage Handle
8895720
Default Alt Text
D21129.id50314.diff (9 KB)

Event Timeline