diff --git a/src/utils/__tests__/PhutilUTF8TestCase.php b/src/utils/__tests__/PhutilUTF8TestCase.php --- a/src/utils/__tests__/PhutilUTF8TestCase.php +++ b/src/utils/__tests__/PhutilUTF8TestCase.php @@ -456,6 +456,8 @@ "musical \xF0\x9D\x84\x9E g-clef" => array(true, false, "gclef text"), "\xF0\x9D\x84" => array(false, false, "Invalid, truncated."), + + "\xE0\x80\x80" => array(false, false, "Nonminimal 3-byte character."), ); foreach ($tests as $input => $test) { diff --git a/src/utils/utf8.php b/src/utils/utf8.php --- a/src/utils/utf8.php +++ b/src/utils/utf8.php @@ -77,7 +77,7 @@ continue; } return false; - } else if ($chr >= 0xE0 && $chr <= 0xEF) { + } else if ($chr > 0xE0 && $chr <= 0xEF) { $chr = ord($string[++$ii]); if ($chr >= 0x80 && $chr <= 0xBF) { $chr = ord($string[++$ii]); @@ -86,6 +86,20 @@ } } return false; + } else if ($chr == 0xE0) { + $chr = ord($string[++$ii]); + + // NOTE: This range starts at 0xA0, not 0x80. The values 0x80-0x9F are + // "valid", but not minimal representations, and MySQL rejects them. We're + // special casing this part of the range. + + if ($chr >= 0xA0 && $chr <= 0xBF) { + $chr = ord($string[++$ii]); + if ($chr >= 0x80 && $chr <= 0xBF) { + continue; + } + } + return false; } return false; @@ -109,6 +123,9 @@ return mb_check_encoding($string, 'UTF-8'); } + // NOTE: This incorrectly accepts characters like \xE0\x80\x80, but should + // not. The MB version works correctly. + $regex = "/^(". "[\x01-\x7F]+". @@ -116,7 +133,7 @@ "|([\xE0-\xEF][\x80-\xBF][\x80-\xBF])". "|([\xF0-\xF4][\x80-\xBF][\x80-\xBF][\x80-\xBF]))*\$/"; - return preg_match($regex, $string); + return (bool)preg_match($regex, $string); }