Index: src/__phutil_library_map__.php
===================================================================
--- src/__phutil_library_map__.php
+++ src/__phutil_library_map__.php
@@ -376,6 +376,7 @@
     'phutil_implode_html' => 'markup/render.php',
     'phutil_is_hiphop_runtime' => 'utils/utils.php',
     'phutil_is_utf8' => 'utils/utf8.php',
+    'phutil_is_utf8_with_only_bmp_characters' => 'utils/utf8.php',
     'phutil_is_windows' => 'utils/utils.php',
     'phutil_loggable_string' => 'utils/utils.php',
     'phutil_passthru' => 'future/exec/execx.php',
Index: src/utils/__tests__/PhutilUTF8TestCase.php
===================================================================
--- src/utils/__tests__/PhutilUTF8TestCase.php
+++ src/utils/__tests__/PhutilUTF8TestCase.php
@@ -430,4 +430,60 @@
   }
 
 
+  public function testUTF8BMP() {
+    $tests = array(
+      "" => array(true, true, "empty string"),
+      "a" => array(true, true, "a"),
+      "a\xCD\xA0\xCD\xA0" => array(true, true, "a with combining"),
+      "\xE2\x98\x83" => array(true, true, "snowman"),
+
+      // This is the last character in BMP, U+FFFF.
+      "\xEF\xBF\xBF" => array(true, true, "U+FFFF"),
+
+      // This isn't valid.
+      "\xEF\xBF\xC0" => array(false, false, "Invalid, byte range."),
+
+      // This is the first character above BMP, U+10000.
+      "\xF0\x90\x80\x80" => array(true, false, "U+10000"),
+      "\xF0\x9D\x84\x9E" => array(true, false, "gclef"),
+
+      "musical \xF0\x9D\x84\x9E g-clef" => array(true, false, "gclef text"),
+      "\xF0\x9D\x84" => array(false, false, "Invalid, truncated."),
+    );
+
+    foreach ($tests as $input => $test) {
+      list($expect_utf8, $expect_bmp, $test_name) = $test;
+
+      $this->assertEqual(
+        $expect_utf8,
+        phutil_is_utf8($input),
+        pht('is_utf(%s)', $test_name));
+
+      $this->assertEqual(
+        $expect_bmp,
+        phutil_is_utf8_with_only_bmp_characters($input),
+        pht('is_utf_bmp(%s)', $test_name));
+    }
+
+    // Boundary cases around overlong encodings and the surrogate range. Only
+    // the BMP check is asserted for these.
+    $bmp_tests = array(
+      "\xE0\x80\x80" => array(false, "Invalid, overlong."),
+      "\xE0\xA0\x80" => array(true, "U+0800"),
+      "\xED\x9F\xBF" => array(true, "U+D7FF"),
+      "\xED\xA0\x80" => array(false, "Invalid, surrogate U+D800."),
+      "\xED\xBF\xBF" => array(false, "Invalid, surrogate U+DFFF."),
+      "\xEE\x80\x80" => array(true, "U+E000"),
+    );
+
+    foreach ($bmp_tests as $input => $test) {
+      list($expect_bmp, $test_name) = $test;
+
+      $this->assertEqual(
+        $expect_bmp,
+        phutil_is_utf8_with_only_bmp_characters($input),
+        pht('is_utf_bmp(%s)', $test_name));
+    }
+  }
+
 }
Index: src/utils/utf8.php
===================================================================
--- src/utils/utf8.php
+++ src/utils/utf8.php
@@ -48,6 +48,33 @@
 
 
 /**
+ * Determine if a string is valid UTF-8, with only basic multilingual plane
+ * characters. This is particularly important because MySQL's `utf8` column
+ * types silently truncate strings which contain characters outside of this
+ * set.
+ *
+ * Overlong three-byte encodings (0xE0 followed by 0x80-0x9F) and UTF-16
+ * surrogates (U+D800 through U+DFFF; 0xED followed by 0xA0-0xBF) are not
+ * valid UTF-8, so they are rejected. The NUL byte is rejected too.
+ *
+ * @param string  String to test for being valid UTF-8 with only characters in
+ *                the basic multilingual plane.
+ * @return bool   True if the string is valid UTF-8 with only BMP characters.
+ */
+function phutil_is_utf8_with_only_bmp_characters($string) {
+  $regex =
+    "/^(".
+      "[\x01-\x7F]+".
+    "|([\xC2-\xDF][\x80-\xBF])".
+    "|(\xE0[\xA0-\xBF][\x80-\xBF])".
+    "|([\xE1-\xEC\xEE\xEF][\x80-\xBF][\x80-\xBF])".
+    "|(\xED[\x80-\x9F][\x80-\xBF]))*\$/";
+
+  return (bool)preg_match($regex, $string);
+}
+
+
+/**
  * Determine if a string is valid UTF-8.
  *
  * @param string Some string which may or may not be valid UTF-8.