Page MenuHomePhabricator

D8310.diff
No OneTemporary

D8310.diff

Index: src/__phutil_library_map__.php
===================================================================
--- src/__phutil_library_map__.php
+++ src/__phutil_library_map__.php
@@ -376,6 +376,7 @@
'phutil_implode_html' => 'markup/render.php',
'phutil_is_hiphop_runtime' => 'utils/utils.php',
'phutil_is_utf8' => 'utils/utf8.php',
+ 'phutil_is_utf8_with_only_bmp_characters' => 'utils/utf8.php',
'phutil_is_windows' => 'utils/utils.php',
'phutil_loggable_string' => 'utils/utils.php',
'phutil_passthru' => 'future/exec/execx.php',
Index: src/utils/__tests__/PhutilUTF8TestCase.php
===================================================================
--- src/utils/__tests__/PhutilUTF8TestCase.php
+++ src/utils/__tests__/PhutilUTF8TestCase.php
@@ -430,4 +430,40 @@
}
+ public function testUTF8BMP() {
+   // Each key is the raw input; each value is the tuple
+   // (expected phutil_is_utf8() result,
+   //  expected phutil_is_utf8_with_only_bmp_characters() result,
+   //  human-readable label used in the assertion message).
+   $cases = array(
+     "" => array(true, true, "empty string"),
+     "a" => array(true, true, "a"),
+     "a\xCD\xA0\xCD\xA0" => array(true, true, "a with combining"),
+     "\xE2\x98\x83" => array(true, true, "snowman"),
+
+     // U+FFFF: the highest code point that still lies inside the BMP.
+     "\xEF\xBF\xBF" => array(true, true, "U+FFFF"),
+
+     // Malformed: the third byte (0xC0) is not a continuation byte.
+     "\xEF\xBF\xC0" => array(false, false, "Invalid, byte range."),
+
+     // U+10000: the lowest code point outside the BMP. Valid UTF-8, but it
+     // needs a four-byte sequence, so the BMP-only check must reject it.
+     "\xF0\x90\x80\x80" => array(true, false, "U+10000"),
+     "\xF0\x9D\x84\x9E" => array(true, false, "gclef"),
+
+     "musical \xF0\x9D\x84\x9E g-clef" => array(true, false, "gclef text"),
+
+     // A four-byte sequence cut off after its third byte.
+     "\xF0\x9D\x84" => array(false, false, "Invalid, truncated."),
+   );
+
+   foreach ($cases as $bytes => $spec) {
+     $label = $spec[2];
+
+     // General UTF-8 validity.
+     $is_utf8 = phutil_is_utf8($bytes);
+     $this->assertEqual($spec[0], $is_utf8, pht('is_utf(%s)', $label));
+
+     // UTF-8 validity restricted to the basic multilingual plane.
+     $is_bmp = phutil_is_utf8_with_only_bmp_characters($bytes);
+     $this->assertEqual($spec[1], $is_bmp, pht('is_utf_bmp(%s)', $label));
+   }
+ }
+
}
Index: src/utils/utf8.php
===================================================================
--- src/utils/utf8.php
+++ src/utils/utf8.php
@@ -48,6 +48,27 @@
/**
+ * Determine if a string is valid UTF-8, with only basic multilingual plane
+ * characters. This is particularly important because MySQL's `utf8` column
+ * types silently truncate strings which contain characters outside of this
+ * set.
+ *
+ * Only well-formed one-, two- and three-byte sequences are accepted. Overlong
+ * three-byte encodings (lead byte 0xE0 followed by 0x80-0x9F) and encoded
+ * UTF-16 surrogates (lead byte 0xED followed by 0xA0-0xBF) are rejected, as
+ * neither is valid UTF-8. Any four-byte sequence is rejected because it
+ * encodes a character above U+FFFF.
+ *
+ * Note that a string containing a NUL byte is reported as not valid, and that
+ * a PCRE engine failure is also reported as `false` (the conservative answer
+ * for callers deciding whether a string is safe to store).
+ *
+ * @param string String to test for being valid UTF-8 with only characters in
+ *               the basic multilingual plane.
+ * @return bool True if the string is valid UTF-8 with only BMP characters.
+ */
+function phutil_is_utf8_with_only_bmp_characters($string) {
+  // The alternatives are mutually exclusive by lead byte, so backtracking can
+  // never turn a failed match into a successful one. Possessive quantifiers
+  // ("++" and "*+") and a non-capturing group therefore give the same answer
+  // while avoiding the exponential backtracking that a nested "(x+)*" can
+  // cause on a long ASCII run followed by an invalid byte.
+  $regex =
+    "/^(?:".
+      // ASCII, excluding NUL.
+      "[\x01-\x7F]++".
+      // Two-byte sequences, U+0080 through U+07FF.
+      "|[\xC2-\xDF][\x80-\xBF]".
+      // Three-byte sequences with lead byte 0xE0; the second byte must be at
+      // least 0xA0, otherwise the encoding is overlong.
+      "|\xE0[\xA0-\xBF][\x80-\xBF]".
+      // Three-byte sequences with no restriction on the second byte.
+      "|[\xE1-\xEC\xEE\xEF][\x80-\xBF][\x80-\xBF]".
+      // Three-byte sequences with lead byte 0xED; the second byte must be at
+      // most 0x9F, otherwise the sequence encodes a UTF-16 surrogate.
+      "|\xED[\x80-\x9F][\x80-\xBF]".
+    ")*+\$/";
+
+  // preg_match() returns 1 on a match, 0 on no match and false on an engine
+  // error; only an actual match counts as valid.
+  return preg_match($regex, $string) === 1;
+}
+
+
+/**
* Determine if a string is valid UTF-8.
*
* @param string Some string which may or may not be valid UTF-8.

File Metadata

Mime Type
text/plain
Expires
Sun, Dec 1, 6:00 PM (20 h, 32 m)
Storage Engine
blob
Storage Format
Encrypted (AES-256-CBC)
Storage Handle
6813407
Default Alt Text
D8310.diff (2 KB)

Event Timeline