Page Menu
Home
Phabricator
Search
Configure Global Search
Log In
Files
F14126873
D8310.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
2 KB
Referenced Files
None
Subscribers
None
D8310.diff
View Options
Index: src/__phutil_library_map__.php
===================================================================
--- src/__phutil_library_map__.php
+++ src/__phutil_library_map__.php
@@ -376,6 +376,7 @@
'phutil_implode_html' => 'markup/render.php',
'phutil_is_hiphop_runtime' => 'utils/utils.php',
'phutil_is_utf8' => 'utils/utf8.php',
+ 'phutil_is_utf8_with_only_bmp_characters' => 'utils/utf8.php',
'phutil_is_windows' => 'utils/utils.php',
'phutil_loggable_string' => 'utils/utils.php',
'phutil_passthru' => 'future/exec/execx.php',
Index: src/utils/__tests__/PhutilUTF8TestCase.php
===================================================================
--- src/utils/__tests__/PhutilUTF8TestCase.php
+++ src/utils/__tests__/PhutilUTF8TestCase.php
@@ -430,4 +430,40 @@
}
+  public function testUTF8BMP() {
+    // Each case lists, in order: the input bytes, whether the input is valid
+    // UTF-8, whether it is valid UTF-8 using only BMP characters, and a label
+    // used in assertion messages.
+    $cases = array(
+      array("", true, true, "empty string"),
+      array("a", true, true, "a"),
+      array("a\xCD\xA0\xCD\xA0", true, true, "a with combining"),
+      array("\xE2\x98\x83", true, true, "snowman"),
+
+      // U+FFFF is the highest code point in the BMP.
+      array("\xEF\xBF\xBF", true, true, "U+FFFF"),
+
+      // The final byte is outside the continuation-byte range.
+      array("\xEF\xBF\xC0", false, false, "Invalid, byte range."),
+
+      // U+10000 is the lowest code point above the BMP.
+      array("\xF0\x90\x80\x80", true, false, "U+10000"),
+      array("\xF0\x9D\x84\x9E", true, false, "gclef"),
+      array("musical \xF0\x9D\x84\x9E g-clef", true, false, "gclef text"),
+
+      // A four-byte sequence that is missing its last byte.
+      array("\xF0\x9D\x84", false, false, "Invalid, truncated."),
+    );
+
+    foreach ($cases as $case) {
+      list($bytes, $want_utf8, $want_bmp, $label) = $case;
+
+      $is_utf8 = phutil_is_utf8($bytes);
+      $this->assertEqual($want_utf8, $is_utf8, pht('is_utf(%s)', $label));
+
+      $is_bmp = phutil_is_utf8_with_only_bmp_characters($bytes);
+      $this->assertEqual($want_bmp, $is_bmp, pht('is_utf_bmp(%s)', $label));
+    }
+  }
+
}
Index: src/utils/utf8.php
===================================================================
--- src/utils/utf8.php
+++ src/utils/utf8.php
@@ -48,6 +48,78 @@
/**
+ * Determine if a string is valid UTF-8, with only basic multilingual plane
+ * (BMP) characters. This is particularly important because MySQL's `utf8`
+ * column types silently truncate strings which contain characters outside of
+ * this set.
+ *
+ * Overlong three-byte encodings and encoded UTF-16 surrogates (U+D800 through
+ * U+DFFF) are not valid UTF-8 and are rejected. Strings containing NUL bytes
+ * are rejected as well.
+ *
+ * The string is scanned byte by byte instead of being matched against one
+ * large regular expression, so the result does not depend on PCRE backtracking
+ * or recursion limits. `preg_match()` returns `false` when it hits such a
+ * limit, which would report a long, valid string as invalid.
+ *
+ * @param string String to test for being valid UTF-8 with only characters in
+ * the basic multilingual plane.
+ * @return bool True if the string is valid UTF-8 with only BMP characters.
+ */
+function phutil_is_utf8_with_only_bmp_characters($string) {
+  // Coerce up front so that byte indexing below sees the same string form
+  // that a string function would.
+  $string = (string)$string;
+  $length = strlen($string);
+
+  for ($ii = 0; $ii < $length; $ii++) {
+    $lead = ord($string[$ii]);
+
+    // Single-byte (ASCII) character. NUL is deliberately not accepted.
+    if ($lead >= 0x01 && $lead <= 0x7F) {
+      continue;
+    }
+
+    // Work out how many continuation bytes must follow the lead byte, and
+    // the range the first of them must fall in. The narrowed ranges after
+    // 0xE0 and 0xED exclude overlong encodings and UTF-16 surrogates.
+    if ($lead >= 0xC2 && $lead <= 0xDF) {
+      list($extra, $min, $max) = array(1, 0x80, 0xBF);
+    } else if ($lead === 0xE0) {
+      list($extra, $min, $max) = array(2, 0xA0, 0xBF);
+    } else if ($lead === 0xED) {
+      list($extra, $min, $max) = array(2, 0x80, 0x9F);
+    } else if ($lead >= 0xE1 && $lead <= 0xEF) {
+      list($extra, $min, $max) = array(2, 0x80, 0xBF);
+    } else {
+      // NUL, a stray continuation byte, an overlong two-byte lead (0xC0 or
+      // 0xC1), or the lead byte of a character above the BMP (0xF0 and up).
+      return false;
+    }
+
+    // The sequence is truncated if the string ends before it is complete.
+    if ($ii + $extra >= $length) {
+      return false;
+    }
+
+    $next = ord($string[++$ii]);
+    if ($next < $min || $next > $max) {
+      return false;
+    }
+
+    if ($extra === 2) {
+      $next = ord($string[++$ii]);
+      if ($next < 0x80 || $next > 0xBF) {
+        return false;
+      }
+    }
+  }
+
+  return true;
+}
+
+
+/**
* Determine if a string is valid UTF-8.
*
* @param string Some string which may or may not be valid UTF-8.
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Sun, Dec 1, 6:00 PM (20 h, 32 m)
Storage Engine
blob
Storage Format
Encrypted (AES-256-CBC)
Storage Handle
6813407
Default Alt Text
D8310.diff (2 KB)
Attached To
Mode
D8310: Add phutil_is_utf8_with_only_bmp_characters()
Attached
Detach File
Event Timeline
Log In to Comment