diff --git a/src/utils/utf8.php b/src/utils/utf8.php index f03663f3..c92ee533 100644 --- a/src/utils/utf8.php +++ b/src/utils/utf8.php @@ -1,982 +1,984 @@ = 0xD800 && $codepoint <= 0xDFFF) { $result[] = str_repeat($replacement, strlen($match)); $offset += strlen($matches[0]); continue; } } $result[] = $match; } else { // Unicode replacement character, U+FFFD. $result[] = $replacement; } $offset += strlen($matches[0]); } return implode('', $result); } /** * Determine if a string is valid UTF-8, with only basic multilingual plane * characters. This is particularly important because MySQL's `utf8` column * types silently truncate strings which contain characters outside of this * set. * * @param string String to test for being valid UTF-8 with only characters in * the basic multilingual plane. * @return bool True if the string is valid UTF-8 with only BMP characters. */ function phutil_is_utf8_with_only_bmp_characters($string) { return phutil_is_utf8_slowly($string, $only_bmp = true); } /** * Determine if a string is valid UTF-8. * * @param string Some string which may or may not be valid UTF-8. * @return bool True if the string is valid UTF-8. */ function phutil_is_utf8($string) { if (function_exists('mb_check_encoding')) { // See T13527. In some versions of PHP, "mb_check_encoding()" strictly // requires a string parameter. $string = phutil_string_cast($string); // If mbstring is available, this is significantly faster than using PHP. return mb_check_encoding($string, 'UTF-8'); } return phutil_is_utf8_slowly($string); } /** * Determine if a string is valid UTF-8, slowly. * * This works on any system, but has very poor performance. * * You should call @{function:phutil_is_utf8} instead of this function, as * that function can use more performant mechanisms if they are available on * the system. * * @param string Some string which may or may not be valid UTF-8. * @param bool True to require all characters be part of the basic * multilingual plane (no more than 3-bytes long). * @return bool True if the string is valid UTF-8. */ function phutil_is_utf8_slowly($string, $only_bmp = false) { // First, check the common case of normal ASCII strings. We're fine if // the string contains no bytes larger than 127. if (preg_match('/^[\x01-\x7F]+\z/', $string)) { return true; } // NOTE: In the past, we used a large regular expression in the form of // '(x|y|z)+' to match UTF8 strings. However, PCRE can segfaults on patterns // like this at relatively small input sizes, at least on some systems // (observed on OSX and Windows). This is apparently because the internal // implementation is recursive and it blows the stack. // See for some discussion. Since the // input limit is extremely low (less than 50KB on my system), do this check // very very slowly in PHP instead. See also T5316. $len = strlen($string); for ($ii = 0; $ii < $len; $ii++) { $chr = ord($string[$ii]); if ($chr >= 0x01 && $chr <= 0x7F) { continue; } else if ($chr >= 0xC2 && $chr <= 0xDF) { ++$ii; if ($ii >= $len) { return false; } $chr = ord($string[$ii]); if ($chr >= 0x80 && $chr <= 0xBF) { continue; } return false; } else if ($chr == 0xED) { // See T11525. Some sequences in this block are surrogate codepoints // that are reserved for use in UTF16. We should reject them. $codepoint = ($chr & 0x0F) << 12; ++$ii; if ($ii >= $len) { return false; } $chr = ord($string[$ii]); $codepoint += ($chr & 0x3F) << 6; if ($chr >= 0x80 && $chr <= 0xBF) { ++$ii; if ($ii >= $len) { return false; } $chr = ord($string[$ii]); $codepoint += ($chr & 0x3F); if ($codepoint >= 0xD800 && $codepoint <= 0xDFFF) { // Reject these surrogate codepoints. return false; } if ($chr >= 0x80 && $chr <= 0xBF) { continue; } } return false; } else if ($chr > 0xE0 && $chr <= 0xEF) { ++$ii; if ($ii >= $len) { return false; } $chr = ord($string[$ii]); if ($chr >= 0x80 && $chr <= 0xBF) { ++$ii; if ($ii >= $len) { return false; } $chr = ord($string[$ii]); if ($chr >= 0x80 && $chr <= 0xBF) { continue; } } return false; } else if ($chr == 0xE0) { ++$ii; if ($ii >= $len) { return false; } $chr = ord($string[$ii]); // NOTE: This range starts at 0xA0, not 0x80. The values 0x80-0xA0 are // "valid", but not minimal representations, and MySQL rejects them. We're // special casing this part of the range. if ($chr >= 0xA0 && $chr <= 0xBF) { ++$ii; if ($ii >= $len) { return false; } $chr = ord($string[$ii]); if ($chr >= 0x80 && $chr <= 0xBF) { continue; } } return false; } else if (!$only_bmp) { if ($chr > 0xF0 && $chr <= 0xF4) { ++$ii; if ($ii >= $len) { return false; } $chr = ord($string[$ii]); if ($chr >= 0x80 && $chr <= 0xBF) { ++$ii; if ($ii >= $len) { return false; } $chr = ord($string[$ii]); if ($chr >= 0x80 && $chr <= 0xBF) { ++$ii; if ($ii >= $len) { return false; } $chr = ord($string[$ii]); if ($chr >= 0x80 && $chr <= 0xBF) { continue; } } } } else if ($chr == 0xF0) { ++$ii; if ($ii >= $len) { return false; } $chr = ord($string[$ii]); // NOTE: As above, this range starts at 0x90, not 0x80. The values // 0x80-0x90 are not minimal representations. if ($chr >= 0x90 && $chr <= 0xBF) { ++$ii; if ($ii >= $len) { return false; } $chr = ord($string[$ii]); if ($chr >= 0x80 && $chr <= 0xBF) { ++$ii; if ($ii >= $len) { return false; } $chr = ord($string[$ii]); if ($chr >= 0x80 && $chr <= 0xBF) { continue; } } } } } return false; } return true; } /** * Find the character length of a UTF-8 string. * * @param string A valid utf-8 string. * @return int The character length of the string. */ function phutil_utf8_strlen($string) { if (function_exists('utf8_decode')) { return strlen(utf8_decode($string)); } return count(phutil_utf8v($string)); } /** * Find the console display length of a UTF-8 string. This may differ from the * character length of the string if it contains double-width characters, like * many Chinese characters. * * This method is based on a C implementation here, which is based on the IEEE * standards. The source has more discussion and addresses more considerations * than this implementation does. * * http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c * * NOTE: We currently assume width 1 for East-Asian ambiguous characters. * * NOTE: This function is VERY slow. * * @param string A valid UTF-8 string. * @return int The console display length of the string. */ function phutil_utf8_console_strlen($string) { // Formatting and colors don't contribute any width in the console. $string = preg_replace("/\x1B\[\d*m/", '', $string); // In the common case of an ASCII string, just return the string length. if (preg_match('/^[\x01-\x7F]*\z/', $string)) { return strlen($string); } $len = 0; // NOTE: To deal with combining characters, we're splitting the string into // glyphs first (characters with combiners) and then counting just the width // of the first character in each glyph. $display_glyphs = phutil_utf8v_combined($string); foreach ($display_glyphs as $display_glyph) { $glyph_codepoints = phutil_utf8v_codepoints($display_glyph); foreach ($glyph_codepoints as $c) { if ($c == 0) { break; } $len += 1 + ($c >= 0x1100 && ($c <= 0x115F || /* Hangul Jamo init. consonants */ $c == 0x2329 || $c == 0x232A || ($c >= 0x2E80 && $c <= 0xA4CF && $c != 0x303F) || /* CJK ... Yi */ ($c >= 0xAC00 && $c <= 0xD7A3) || /* Hangul Syllables */ ($c >= 0xF900 && $c <= 0xFAFF) || /* CJK Compatibility Ideographs */ ($c >= 0xFE10 && $c <= 0xFE19) || /* Vertical forms */ ($c >= 0xFE30 && $c <= 0xFE6F) || /* CJK Compatibility Forms */ ($c >= 0xFF00 && $c <= 0xFF60) || /* Fullwidth Forms */ ($c >= 0xFFE0 && $c <= 0xFFE6) || ($c >= 0x20000 && $c <= 0x2FFFD) || ($c >= 0x30000 && $c <= 0x3FFFD))); break; } } return $len; } /** * Test if a string contains Chinese, Japanese, or Korean characters. * * Most languages use spaces to separate words, but these languages do not. * * @param string String to examine, in UTF8. * @return bool True if the string contains Chinese, Japanese, or Korean * characters. */ function phutil_utf8_is_cjk($string) { $codepoints = phutil_utf8v_codepoints($string); foreach ($codepoints as $codepoint) { // CJK Unified Ideographs if ($codepoint >= 0x4E00 && $codepoint <= 0x9FFF) { return true; } // CJK Unified Ideographs Extension A if ($codepoint >= 0x3400 && $codepoint <= 0x4DBF) { return true; } // CJK Unified Ideographs Extension B if ($codepoint >= 0x20000 && $codepoint <= 0x2A6DF) { return true; } // CJK Unified Ideographs Extension C if ($codepoint >= 0x2A700 && $codepoint <= 0x2B73F) { return true; } // CJK Unified Ideographs Extension D if ($codepoint >= 0x2B740 && $codepoint <= 0x2B81F) { return true; } // CJK Unified Ideographs Extension E if ($codepoint >= 0x2B820 && $codepoint <= 0x2CEAF) { return true; } // CJK Unified Ideographs Extension F if ($codepoint >= 0x2CEB0 && $codepoint <= 0x2EBEF) { return true; } // CJK Compatibility Ideographs if ($codepoint >= 0xF900 && $codepoint <= 0xFAFF) { return true; } } return false; } /** * Split a UTF-8 string into an array of characters. Combining characters are * also split. * * @param string A valid utf-8 string. * @param int|null Stop processing after examining this many bytes. * @return list A list of characters in the string. */ function phutil_utf8v($string, $byte_limit = null) { + $string = phutil_string_cast($string); + $res = array(); $len = strlen($string); $ii = 0; while ($ii < $len) { $byte = $string[$ii]; if ($byte <= "\x7F") { $res[] = $byte; $ii += 1; if ($byte_limit && ($ii >= $byte_limit)) { break; } continue; } else if ($byte < "\xC0") { throw new Exception( pht('Invalid UTF-8 string passed to %s.', __FUNCTION__)); } else if ($byte <= "\xDF") { $seq_len = 2; } else if ($byte <= "\xEF") { $seq_len = 3; } else if ($byte <= "\xF7") { $seq_len = 4; } else if ($byte <= "\xFB") { $seq_len = 5; } else if ($byte <= "\xFD") { $seq_len = 6; } else { throw new Exception( pht('Invalid UTF-8 string passed to %s.', __FUNCTION__)); } if ($ii + $seq_len > $len) { throw new Exception( pht('Invalid UTF-8 string passed to %s.', __FUNCTION__)); } for ($jj = 1; $jj < $seq_len; ++$jj) { if ($string[$ii + $jj] >= "\xC0") { throw new Exception( pht('Invalid UTF-8 string passed to %s.', __FUNCTION__)); } } $res[] = substr($string, $ii, $seq_len); $ii += $seq_len; if ($byte_limit && ($ii >= $byte_limit)) { break; } } return $res; } /** * Split a UTF-8 string into an array of codepoints (as integers). * * @param string A valid UTF-8 string. * @return list A list of codepoints, as integers. */ function phutil_utf8v_codepoints($string) { $str_v = phutil_utf8v($string); foreach ($str_v as $key => $char) { $c = ord($char[0]); $v = 0; if (($c & 0x80) == 0) { $v = $c; } else if (($c & 0xE0) == 0xC0) { $v = (($c & 0x1F) << 6) + ((ord($char[1]) & 0x3F)); } else if (($c & 0xF0) == 0xE0) { $v = (($c & 0x0F) << 12) + ((ord($char[1]) & 0x3F) << 6) + ((ord($char[2]) & 0x3F)); } else if (($c & 0xF8) == 0xF0) { $v = (($c & 0x07) << 18) + ((ord($char[1]) & 0x3F) << 12) + ((ord($char[2]) & 0x3F) << 6) + ((ord($char[3]) & 0x3F)); } else if (($c & 0xFC) == 0xF8) { $v = (($c & 0x03) << 24) + ((ord($char[1]) & 0x3F) << 18) + ((ord($char[2]) & 0x3F) << 12) + ((ord($char[3]) & 0x3F) << 6) + ((ord($char[4]) & 0x3F)); } else if (($c & 0xFE) == 0xFC) { $v = (($c & 0x01) << 30) + ((ord($char[1]) & 0x3F) << 24) + ((ord($char[2]) & 0x3F) << 18) + ((ord($char[3]) & 0x3F) << 12) + ((ord($char[4]) & 0x3F) << 6) + ((ord($char[5]) & 0x3F)); } $str_v[$key] = $v; } return $str_v; } /** * Convert a Unicode codepoint into a UTF8-encoded string. * * @param int Unicode codepoint. * @return string UTF8 encoding. */ function phutil_utf8_encode_codepoint($codepoint) { if ($codepoint < 0x80) { $r = chr($codepoint); } else if ($codepoint < 0x800) { $r = chr(0xC0 | (($codepoint >> 6) & 0x1F)). chr(0x80 | (($codepoint) & 0x3F)); } else if ($codepoint < 0x10000) { $r = chr(0xE0 | (($codepoint >> 12) & 0x0F)). chr(0x80 | (($codepoint >> 6) & 0x3F)). chr(0x80 | (($codepoint) & 0x3F)); } else if ($codepoint < 0x110000) { $r = chr(0xF0 | (($codepoint >> 18) & 0x07)). chr(0x80 | (($codepoint >> 12) & 0x3F)). chr(0x80 | (($codepoint >> 6) & 0x3F)). chr(0x80 | (($codepoint) & 0x3F)); } else { throw new Exception( pht( 'Encoding UTF8 codepoint "%s" is not supported.', $codepoint)); } return $r; } /** * Hard-wrap a block of UTF-8 text with embedded HTML tags and entities. * * @param string An HTML string with tags and entities. * @return list List of hard-wrapped lines. */ function phutil_utf8_hard_wrap_html($string, $width) { $break_here = array(); // Convert the UTF-8 string into a list of UTF-8 characters. $vector = phutil_utf8v($string); $len = count($vector); $char_pos = 0; for ($ii = 0; $ii < $len; ++$ii) { // An ampersand indicates an HTML entity; consume the whole thing (until // ";") but treat it all as one character. if ($vector[$ii] == '&') { do { ++$ii; } while ($vector[$ii] != ';'); ++$char_pos; // An "<" indicates an HTML tag, consume the whole thing but don't treat // it as a character. } else if ($vector[$ii] == '<') { do { ++$ii; } while ($vector[$ii] != '>'); } else { ++$char_pos; } // Keep track of where we need to break the string later. if ($char_pos == $width) { $break_here[$ii] = true; $char_pos = 0; } } $result = array(); $string = ''; foreach ($vector as $ii => $char) { $string .= $char; if (isset($break_here[$ii])) { $result[] = $string; $string = ''; } } if (strlen($string)) { $result[] = $string; } return $result; } /** * Hard-wrap a block of UTF-8 text with no embedded HTML tags and entities. * * @param string A non HTML string * @param int Width of the hard-wrapped lines * @return list List of hard-wrapped lines. */ function phutil_utf8_hard_wrap($string, $width) { $result = array(); $lines = phutil_split_lines($string, $retain_endings = false); foreach ($lines as $line) { // Convert the UTF-8 string into a list of UTF-8 characters. $vector = phutil_utf8v($line); $len = count($vector); $buffer = ''; for ($ii = 1; $ii <= $len; ++$ii) { $buffer .= $vector[$ii - 1]; if (($ii % $width) === 0) { $result[] = $buffer; $buffer = ''; } } if (strlen($buffer)) { $result[] = $buffer; } } return $result; } /** * Convert a string from one encoding (like ISO-8859-1) to another encoding * (like UTF-8). * * This is primarily a thin wrapper around `mb_convert_encoding()` which checks * you have the extension installed, since we try to require the extension * only if you actually need it (i.e., you want to work with encodings other * than UTF-8). * * NOTE: This function assumes that the input is in the given source encoding. * If it is not, it may not output in the specified target encoding. If you * need to perform a hard conversion to UTF-8, use this function in conjunction * with @{function:phutil_utf8ize}. We can detect failures caused by invalid * encoding names, but `mb_convert_encoding()` fails silently if the * encoding name identifies a real encoding but the string is not actually * encoded with that encoding. * * @param string String to re-encode. * @param string Target encoding name, like "UTF-8". * @param string Source encoding name, like "ISO-8859-1". * @return string Input string, with converted character encoding. * * @phutil-external-symbol function mb_convert_encoding */ function phutil_utf8_convert($string, $to_encoding, $from_encoding) { if (!$from_encoding) { throw new InvalidArgumentException( pht( 'Attempting to convert a string encoding, but no source encoding '. 'was provided. Explicitly provide the source encoding.')); } if (!$to_encoding) { throw new InvalidArgumentException( pht( 'Attempting to convert a string encoding, but no target encoding '. 'was provided. Explicitly provide the target encoding.')); } // Normalize encoding names so we can no-op the very common case of UTF8 // to UTF8 (or any other conversion where both encodings are identical). $to_upper = strtoupper(str_replace('-', '', $to_encoding)); $from_upper = strtoupper(str_replace('-', '', $from_encoding)); if ($from_upper == $to_upper) { return $string; } if (!function_exists('mb_convert_encoding')) { throw new Exception( pht( "Attempting to convert a string encoding from '%s' to '%s', ". "but the '%s' PHP extension is not available. Install %s to ". "work with encodings other than UTF-8.", $from_encoding, $to_encoding, 'mbstring', 'mbstring')); } $result = @mb_convert_encoding($string, $to_encoding, $from_encoding); if ($result === false) { $message = error_get_last(); if ($message) { $message = idx($message, 'message', pht('Unknown error.')); } throw new Exception( pht( "String conversion from encoding '%s' to encoding '%s' failed: %s", $from_encoding, $to_encoding, $message)); } return $result; } /** * Convert a string to title case in a UTF8-aware way. This function doesn't * necessarily do a great job, but the builtin implementation of `ucwords()` can * completely destroy inputs, so it just has to be better than that. Similar to * @{function:ucwords}. * * @param string UTF-8 input string. * @return string Input, in some semblance of title case. */ function phutil_utf8_ucwords($str) { // NOTE: mb_convert_case() discards uppercase letters in words when converting // to title case. For example, it will convert "AAA" into "Aaa", which is // undesirable. $v = phutil_utf8v($str); $result = ''; $last = null; $ord_a = ord('a'); $ord_z = ord('z'); foreach ($v as $c) { $convert = false; if ($last === null || $last === ' ') { $o = ord($c[0]); if ($o >= $ord_a && $o <= $ord_z) { $convert = true; } } if ($convert) { $result .= phutil_utf8_strtoupper($c); } else { $result .= $c; } $last = $c; } return $result; } /** * Convert a string to lower case in a UTF8-aware way. Similar to * @{function:strtolower}. * * @param string UTF-8 input string. * @return string Input, in some semblance of lower case. * * @phutil-external-symbol function mb_convert_case */ function phutil_utf8_strtolower($str) { if (function_exists('mb_convert_case')) { return mb_convert_case($str, MB_CASE_LOWER, 'UTF-8'); } static $map; if ($map === null) { $map = array_combine( range('A', 'Z'), range('a', 'z')); } return phutil_utf8_strtr($str, $map); } /** * Convert a string to upper case in a UTF8-aware way. Similar to * @{function:strtoupper}. * * @param string UTF-8 input string. * @return string Input, in some semblance of upper case. * * @phutil-external-symbol function mb_convert_case */ function phutil_utf8_strtoupper($str) { if (function_exists('mb_convert_case')) { return mb_convert_case($str, MB_CASE_UPPER, 'UTF-8'); } static $map; if ($map === null) { $map = array_combine( range('a', 'z'), range('A', 'Z')); } return phutil_utf8_strtr($str, $map); } /** * Replace characters in a string in a UTF-aware way. Similar to * @{function:strtr}. * * @param string UTF-8 input string. * @param map Map of characters to replace. * @return string Input with translated characters. */ function phutil_utf8_strtr($str, array $map) { $v = phutil_utf8v($str); $result = ''; foreach ($v as $c) { if (isset($map[$c])) { $result .= $map[$c]; } else { $result .= $c; } } return $result; } /** * Determine if a given unicode character is a combining character or not. * * @param string A single unicode character. * @return boolean True or false. */ function phutil_utf8_is_combining_character($character) { $components = phutil_utf8v_codepoints($character); // Combining Diacritical Marks (0300 - 036F). // Combining Diacritical Marks Supplement (1DC0 - 1DFF). // Combining Diacritical Marks for Symbols (20D0 - 20FF). // Combining Half Marks (FE20 - FE2F). foreach ($components as $codepoint) { if ($codepoint >= 0x0300 && $codepoint <= 0x036F || $codepoint >= 0x1DC0 && $codepoint <= 0x1DFF || $codepoint >= 0x20D0 && $codepoint <= 0x20FF || $codepoint >= 0xFE20 && $codepoint <= 0xFE2F) { return true; } } return false; } /** * Split a UTF-8 string into an array of characters. Combining characters * are not split. * * @param string A valid utf-8 string. * @return list A list of characters in the string. */ function phutil_utf8v_combined($string) { $components = phutil_utf8v($string); return phutil_utf8v_combine_characters($components); } /** * Merge combining characters in a UTF-8 string. * * This is a low-level method which can allow other operations to do less work. * If you have a string, call @{method:phutil_utf8v_combined} instead. * * @param list List of UTF-8 characters. * @return list List of UTF-8 strings with combining characters merged. */ function phutil_utf8v_combine_characters(array $characters) { if (!$characters) { return array(); } // If the first character in the string is a combining character, // start with a space. if (phutil_utf8_is_combining_character($characters[0])) { $buf = ' '; } else { $buf = null; } $parts = array(); foreach ($characters as $character) { if (!isset($character[1])) { // This an optimization: there are no one-byte combining characters, // so we can just pass these through unmodified. $is_combining = false; } else { $is_combining = phutil_utf8_is_combining_character($character); } if ($is_combining) { $buf .= $character; } else { if ($buf !== null) { $parts[] = $buf; } $buf = $character; } } $parts[] = $buf; return $parts; } /** * Return the current system locale setting (LC_ALL). * * @return string Current system locale setting. */ function phutil_get_system_locale() { $locale = setlocale(LC_ALL, 0); if ($locale === false) { throw new Exception( pht( 'Unable to determine current system locale (call to '. '"setlocale(LC_ALL, 0)" failed).')); } return $locale; } /** * Test if a system locale (LC_ALL) is available on the system. * * @param string Locale name like "en_US.UTF-8". * @return bool True if the locale is available. */ function phutil_is_system_locale_available($locale) { $old_locale = phutil_get_system_locale(); $is_available = @setlocale(LC_ALL, $locale); setlocale(LC_ALL, $old_locale); return ($is_available !== false); } /** * Set the system locale (LC_ALL) to a particular value. * * @param string New locale setting. * @return void */ function phutil_set_system_locale($locale) { $ok = @setlocale(LC_ALL, $locale); if (!$ok) { throw new Exception( pht( 'Failed to set system locale (to "%s").', $locale)); } }