diff --git a/src/utils/PhutilUTF8StringTruncator.php b/src/utils/PhutilUTF8StringTruncator.php index 421e57f..888a7ee 100644 --- a/src/utils/PhutilUTF8StringTruncator.php +++ b/src/utils/PhutilUTF8StringTruncator.php @@ -1,288 +1,295 @@ setMaximumGlyphs(80) * ->truncateString($long); * * Byte limits restrict the number of bytes the result may contain. They are * appropriate when you care about how much storage a string requires. * * Codepoint limits restrict the number of codepoints the result may contain. * Since codepoints may have up to 4 bytes, the resulting strings may require * more than this many bytes. This kind of limit is appropriate when you * are using UTF-8 storage. * * Glyph limits restrict the display size of the string. Because a single glyph * may have an arbitrary number of combining characters, this does not impose * a storage size limit on the string: a string with only one glyph may require * an arbitrarily large number of bytes. * * You can set more than one limit; the smallest limit will be used. * * NOTE: This function makes a best effort to apply some reasonable rules but * will not work well for the full range of unicode languages. 
*/ final class PhutilUTF8StringTruncator extends Phobject { private $maximumBytes; private $maximumCodepoints; private $maximumGlyphs; private $minimumLimit; private $terminator = "\xE2\x80\xA6"; private $terminatorBytes = 3; private $terminatorCodepoints = 1; private $terminatorGlyphs = 1; public function setMaximumBytes($maximum_bytes) { $this->maximumBytes = $maximum_bytes; $this->didUpdateMaxima(); return $this; } public function setMaximumCodepoints($maximum_codepoints) { $this->maximumCodepoints = $maximum_codepoints; $this->didUpdateMaxima(); return $this; } public function setMaximumGlyphs($maximum_glyphs) { $this->maximumGlyphs = $maximum_glyphs; $this->didUpdateMaxima(); return $this; } private function didUpdateMaxima() { $this->minimumLimit = INF; if ($this->maximumBytes) { $this->minimumLimit = min($this->minimumLimit, $this->maximumBytes); } if ($this->maximumCodepoints) { $this->minimumLimit = min($this->minimumLimit, $this->maximumCodepoints); } if ($this->maximumGlyphs) { $this->minimumLimit = min($this->minimumLimit, $this->maximumGlyphs); } } public function setTerminator($terminator) { $this->terminator = $terminator; $this->terminatorBytes = strlen($terminator); $this->terminatorCodepoints = count(phutil_utf8v($terminator)); $this->terminatorGlyphs = count(phutil_utf8v_combined($terminator)); return $this; } public function truncateString($string) { // First, check if the string has fewer bytes than the most restrictive // limit. Codepoints and glyphs always take up at least one byte, so we can // just return the string unmodified if we're under all of the limits. $byte_len = strlen($string); if ($byte_len <= $this->minimumLimit) { return $string; } // We're going to vectorize the string so we can deal with it in terms // of unicode characters. 
If the string is huge (like 10MB) and we are // only extracting a tiny piece of it (like the first 1024 bytes), we // want to avoid vectorizing the entire string in cases where there is // no possibility that we'll need all of it. Try to compute a "hard limit": // an upper bound on the number of bytes we can ever need to process. $hard_limits = array(); if ($this->maximumBytes) { $hard_limits[] = $this->maximumBytes; } if ($this->maximumCodepoints) { // No UTF8 character is longer than 6 bytes, so we can impose a ceiling // if we have a codepoint limit. $hard_limits[] = ($this->maximumCodepoints * 6); } if ($hard_limits) { // Add a few more bytes onto the end so that we have a little more of // the string than we actually need and can get the right terminator // behavior. $hard_limit = max($hard_limits) + 32; } else { $hard_limit = null; } // Build a vector of characters first. $string_pv = phutil_utf8v($string, $hard_limit); $point_len = count($string_pv); // We always need the combined vector, even if we're only doing byte or // codepoint truncation, because we don't want to truncate to half of a // combining character. $string_gv = phutil_utf8v_combine_characters($string_pv); $glyph_len = count($string_gv); // Now, check if we're still over the limits. For example, a string may // be over the raw byte limit but under the glyph limit if it contains // several multibyte characters. $too_long = false; if ($this->maximumBytes && ($byte_len > $this->maximumBytes)) { $too_long = true; } if ($this->maximumCodepoints && ($point_len > $this->maximumCodepoints)) { $too_long = true; } if ($this->maximumGlyphs && ($glyph_len > $this->maximumGlyphs)) { $too_long = true; } if (!$too_long) { return $string; } // This string is legitimately longer than at least one of the limits, so // we need to truncate it. Find the minimum cutoff point: this is the last // glyph we can possibly return while satisfying the limits and having space // for the terminator. 
$cutoff = $glyph_len; if ($this->maximumBytes) { if ($byte_len <= $this->maximumBytes) { $cutoff = $glyph_len; } else { $bytes = $this->terminatorBytes; for ($ii = 0; $ii < $glyph_len; $ii++) { $bytes += strlen($string_gv[$ii]); if ($bytes > $this->maximumBytes) { $cutoff = $ii; break; } } } } if ($this->maximumCodepoints) { if ($point_len <= $this->maximumCodepoints) { $cutoff = min($cutoff, $glyph_len); } else { $points = 0; for ($ii = 0; $ii < $glyph_len; $ii++) { $glyph_bytes = strlen($string_gv[$ii]); while ($points < $point_len) { $glyph_bytes -= strlen($string_pv[$points]); $points++; if ($glyph_bytes <= 0) { break; } } $points_total = $points + $this->terminatorCodepoints; if ($points_total > $this->maximumCodepoints) { $cutoff = min($cutoff, $ii); break; } } } } if ($this->maximumGlyphs) { if ($glyph_len <= $this->maximumGlyphs) { $cutoff = min($cutoff, $glyph_len); } else { $cutoff = min($cutoff, $this->maximumGlyphs - $this->terminatorGlyphs); } } // If we don't have enough characters for anything, just return the // terminator. if ($cutoff <= 0) { return $this->terminator; } // Otherwise, we're going to try to cut the string off somewhere reasonable // rather than somewhere arbitrary. // NOTE: This is not complete, and there are many other word boundary // characters and reasonable places to break words in the UTF-8 character // space. For now, this gives us reasonable behavior for latin languages. We // don't necessarily have access to PCRE+Unicode so there isn't a great way // for us to look up character attributes. // If we encounter these, prefer to break on them instead of cutting the // string off in the middle of a word. static $break_characters = array( ' ' => true, "\n" => true, ';' => true, ':' => true, '[' => true, '(' => true, ',' => true, '-' => true, ); // If we encounter these, shorten to this character exactly without // appending the terminal. static $stop_characters = array( '.' => true, '!' => true, '?' 
=> true, ); // Search backward in the string, looking for reasonable places to break it. $word_boundary = null; $stop_boundary = null; + $any_nonboundary = false; // If we do a word break with a terminal, we have to look beyond at least // the number of characters in the terminal. If the terminal is longer than // the required length, we'll skip this whole block and return it on its // own. // Only search backward for a while. At some point we don't get a better // result by looking through the whole string, and if this is "MMM..." or // a non-latin language without word break characters we're just wasting // time. - $search = max(0, $cutoff - 256); + // See PHI654. We also only look for a break near the end of the text, + // relative to the length of the text. If the text is something like + // "O123: MMMMMM..." or "See path/to/long/thing", we want to cut the very + // long word in half, not just render "O123..." or "See...". + + $search = max(0, $cutoff - 256, $cutoff / 2); for ($ii = min($cutoff, $glyph_len - 1); $ii >= $search; $ii--) { $c = $string_gv[$ii]; if (isset($break_characters[$c])) { $word_boundary = $ii; } else if (isset($stop_characters[$c])) { $stop_boundary = $ii + 1; break; } else { + $any_nonboundary = true; if ($word_boundary !== null) { break; } } } if ($stop_boundary !== null) { // We found a character like ".". Cut the string there, without appending // the terminal. $string_part = array_slice($string_gv, 0, $stop_boundary); return implode('', $string_part); } // If we didn't find any boundary characters or we found ONLY boundary // characters, just break at the maximum character length. 
- if ($word_boundary === null || $word_boundary === 0) { + if ($word_boundary === null || !$any_nonboundary) { $word_boundary = $cutoff; } $string_part = array_slice($string_gv, 0, $word_boundary); $string_part = implode('', $string_part); return $string_part.$this->terminator; } } diff --git a/src/utils/__tests__/PhutilUTF8TestCase.php b/src/utils/__tests__/PhutilUTF8TestCase.php index a6e76a2..84c35cb 100644 --- a/src/utils/__tests__/PhutilUTF8TestCase.php +++ b/src/utils/__tests__/PhutilUTF8TestCase.php @@ -1,808 +1,813 @@ assertEqual($input, phutil_utf8ize($input)); } public function testUTF8izeUTF8Ignored() { $input = "\xc3\x9c \xc3\xbc \xe6\x9d\xb1!"; $this->assertEqual($input, phutil_utf8ize($input)); } public function testUTF8izeLongStringNosegfault() { // For some reason my laptop is segfaulting on long inputs inside // preg_match(). Forestall this craziness in the common case, at least. phutil_utf8ize(str_repeat('x', 1024 * 1024)); $this->assertTrue(true); } public function testUTF8izeInvalidUTF8Fixed() { $input = "\xc3 this has \xe6\x9d some invalid utf8 \xe6"; $expect = "\xEF\xBF\xBD this has \xEF\xBF\xBD\xEF\xBF\xBD some invalid utf8 ". "\xEF\xBF\xBD"; $result = phutil_utf8ize($input); $this->assertEqual($expect, $result); } public function testUTF8izeOwlIsCuteAndFerocious() { // This was once a ferocious owl when we used to use "?" as the replacement // character instead of U+FFFD, but now he is sort of not as cute or // ferocious. $input = "M(o\xEE\xFF\xFFo)M"; $expect = "M(o\xEF\xBF\xBD\xEF\xBF\xBD\xEF\xBF\xBDo)M"; $result = phutil_utf8ize($input); $this->assertEqual($expect, $result); } public function testOverlongFormFiltering() { $bad = "\xEF\xBF\xBD"; $map = array( 'quack' => 'quack', // This is U+1000, a valid character. "\xE1\x80\x80" => "\xE1\x80\x80", // This is a 2-byte encoding of U+0000. "\xC0\x80" => "{$bad}{$bad}", // This is a 3-byte encoding of U+0020. 
"\xE0\x80\xA0" => "{$bad}{$bad}{$bad}", "A \xE0\x83\x83" => "A {$bad}{$bad}{$bad}", ); foreach ($map as $input => $expect) { $actual = phutil_utf8ize($input); $this->assertEqual( $expect, $actual, pht('Overlong form canonicalization of: %s', $input)); } } public function testSurrogateFiltering() { $bad = "\xEF\xBF\xBD"; $map = array( "A \xED\xA9\x98" => "A {$bad}{$bad}{$bad}", ); foreach ($map as $input => $expect) { $actual = phutil_utf8ize($input); $this->assertEqual( $expect, $actual, pht('Surrogate filtering: %s', $input)); } } public function testUTF8CodepointEncoding() { $map = array( 0x20 => ' ', 0x7E => '~', 0xE9 => "\xC3\xA9", 0x2603 => "\xE2\x98\x83", 0x1F417 => "\xF0\x9F\x90\x97", ); foreach ($map as $input => $expect) { $actual = phutil_utf8_encode_codepoint($input); $this->assertEqual( $expect, $actual, pht('UTF8 codepoint encoding of "%s".', $input)); } } public function testUTF8len() { $strings = array( '' => 0, 'x' => 1, "\xEF\xBF\xBD" => 1, "x\xe6\x9d\xb1y" => 3, 'xyz' => 3, 'quack' => 5, ); foreach ($strings as $str => $expect) { $this->assertEqual($expect, phutil_utf8_strlen($str), 'Length of '.$str); } } public function testUTF8v() { $strings = array( '' => array(), 'x' => array('x'), 'quack' => array('q', 'u', 'a', 'c', 'k'), "x\xe6\x9d\xb1y" => array('x', "\xe6\x9d\xb1", 'y'), // This is a combining character. "x\xCD\xA0y" => array('x', "\xCD\xA0", 'y'), ); foreach ($strings as $str => $expect) { $this->assertEqual($expect, phutil_utf8v($str), 'Vector of '.$str); } } public function testUTF8vCodepoints() { $strings = array( '' => array(), 'x' => array(0x78), 'quack' => array(0x71, 0x75, 0x61, 0x63, 0x6B), "x\xe6\x9d\xb1y" => array(0x78, 0x6771, 0x79), "\xC2\xBB" => array(0x00BB), "\xE2\x98\x83" => array(0x2603), "\xEF\xBF\xBF" => array(0xFFFF), "\xF0\x9F\x92\xA9" => array(0x1F4A9), // This is a combining character. 
"x\xCD\xA0y" => array(0x78, 0x0360, 0x79), ); foreach ($strings as $str => $expect) { $this->assertEqual( $expect, phutil_utf8v_codepoints($str), pht('Codepoint Vector of %s', $str)); } } public function testUTF8ConsoleStrlen() { $strings = array( '' => 0, "\0" => 0, 'x' => 1, // Double-width chinese character. "\xe6\x9d\xb1" => 2, // Combining character. "x\xCD\xA0y" => 2, // Combining plus double-width. "\xe6\x9d\xb1\xCD\xA0y" => 3, // Colors and formatting. "\x1B[1mx\x1B[m" => 1, "\x1B[1m\x1B[31mx\x1B[m" => 1, ); foreach ($strings as $str => $expect) { $this->assertEqual( $expect, phutil_utf8_console_strlen($str), pht('Console Length of %s', $str)); } } public function testUTF8shorten() { $inputs = array( array('1erp derp derp', 9, '', '1erp derp'), array('2erp derp derp', 12, '...', '2erp derp...'), array('derpxderpxderp', 12, '...', 'derpxderp...'), array("derp\xE2\x99\x83derpderp", 12, '...', "derp\xE2\x99\x83derp..."), array('', 12, '...', ''), array('derp', 12, '...', 'derp'), array('11111', 5, '2222', '11111'), array('111111', 5, '2222', '12222'), array('D1rp. Derp derp.', 7, '...', 'D1rp.'), // "D2rp." is a better shortening of this, but it's dramatically more // complicated to implement with the newer byte/glyph/character // shortening code. array('D2rp. Derp derp.', 5, '...', 'D2...'), array('D3rp. Derp derp.', 4, '...', 'D...'), array('D4rp. Derp derp.', 14, '...', 'D4rp. Derp...'), array('D5rpderp, derp derp', 16, '...', 'D5rpderp...'), array('D6rpderp, derp derp', 17, '...', 'D6rpderp, derp...'), // Strings with combining characters. array("Gr\xCD\xA0mpyCatSmiles", 8, '...', "Gr\xCD\xA0mpy..."), array("X\xCD\xA0\xCD\xA0\xCD\xA0Y", 1, '', "X\xCD\xA0\xCD\xA0\xCD\xA0"), - // This behavior is maybe a little bad, but it seems mostly reasonable, - // at least for latin languages. 
array( 'Derp, supercalafragalisticexpialadoshus', 30, '...', - 'Derp...', + 'Derp, supercalafragalistice...', ), // If a string has only word-break characters in it, we should just cut // it, not produce only the terminal. array('((((((((((', 8, '...', '(((((...'), // Terminal is longer than requested input. array('derp', 3, 'quack', 'quack'), + + array( + 'O123: com/oracle/java/path/to/application/source/ThingFactory.java', + 32, + '...', + 'O123: com/oracle/java/path/to...', + ), ); foreach ($inputs as $input) { list($string, $length, $terminal, $expect) = $input; $result = id(new PhutilUTF8StringTruncator()) ->setMaximumGlyphs($length) ->setTerminator($terminal) ->truncateString($string); $this->assertEqual($expect, $result, pht('Shortening of %s', $string)); } } public function testUTF8StringTruncator() { $cases = array( array( "o\xCD\xA0o\xCD\xA0o\xCD\xA0o\xCD\xA0o\xCD\xA0", 6, "o\xCD\xA0!", 6, "o\xCD\xA0o\xCD\xA0!", 6, "o\xCD\xA0o\xCD\xA0o\xCD\xA0o\xCD\xA0o\xCD\xA0", ), array( "X\xCD\xA0\xCD\xA0\xCD\xA0Y", 6, '!', 6, "X\xCD\xA0\xCD\xA0\xCD\xA0Y", 6, "X\xCD\xA0\xCD\xA0\xCD\xA0Y", ), array( "X\xCD\xA0\xCD\xA0\xCD\xA0YZ", 6, '!', 5, "X\xCD\xA0\xCD\xA0\xCD\xA0!", 2, "X\xCD\xA0\xCD\xA0\xCD\xA0!", ), array( "\xE2\x98\x83\xE2\x98\x83\xE2\x98\x83\xE2\x98\x83", 4, "\xE2\x98\x83!", 3, "\xE2\x98\x83\xE2\x98\x83!", 3, "\xE2\x98\x83\xE2\x98\x83!", ), ); foreach ($cases as $case) { list($input, $b_len, $b_out, $p_len, $p_out, $g_len, $g_out) = $case; $result = id(new PhutilUTF8StringTruncator()) ->setMaximumBytes($b_len) ->setTerminator('!') ->truncateString($input); $this->assertEqual($b_out, $result, pht('byte-short of %s', $input)); $result = id(new PhutilUTF8StringTruncator()) ->setMaximumCodepoints($p_len) ->setTerminator('!') ->truncateString($input); $this->assertEqual($p_out, $result, pht('codepoint-short of %s', $input)); $result = id(new PhutilUTF8StringTruncator()) ->setMaximumGlyphs($g_len) ->setTerminator('!') ->truncateString($input); 
$this->assertEqual($g_out, $result, pht('glyph-short of %s', $input)); } } public function testUTF8LargeTruncation() { // This is testing that our performance is reasonable when truncating a // large input into a small output. Runtime should be on the order of the // output size, not the input size. $whale = "\xF0\x9F\x90\xB3"; $input = str_repeat($whale, 1024 * 1024); $result = id(new PhutilUTF8StringTruncator()) ->setMaximumBytes(16) ->setTerminator('!') ->truncateString($input); $this->assertEqual( str_repeat($whale, 3).'!', $result, pht('Large truncation.')); } public function testUTF8Wrap() { $inputs = array( array( 'aaaaaaa', 3, array( 'aaa', 'aaa', 'a', ), ), array( 'aaaaaaa', 3, array( 'aaa', 'aaa', 'a', ), ), array( 'aa&aaaa', 3, array( 'aa&', 'aaa', 'a', ), ), array( "aa\xe6\x9d\xb1aaaa", 3, array( "aa\xe6\x9d\xb1", 'aaa', 'a', ), ), array( '', 80, array( ), ), array( 'a', 80, array( 'a', ), ), ); foreach ($inputs as $input) { list($string, $width, $expect) = $input; $this->assertEqual( $expect, phutil_utf8_hard_wrap_html($string, $width), pht("Wrapping of '%s'.", $string)); } } public function testUTF8NonHTMLWrap() { $inputs = array( array( 'aaaaaaa', 3, array( 'aaa', 'aaa', 'a', ), ), array( 'abracadabra!', 4, array( 'abra', 'cada', 'bra!', ), ), array( '', 10, array( ), ), array( 'a', 20, array( 'a', ), ), array( "aa\xe6\x9d\xb1aaaa", 3, array( "aa\xe6\x9d\xb1", 'aaa', 'a', ), ), array( "mmm\nmmm\nmmmm", 3, array( 'mmm', 'mmm', 'mmm', 'm', ), ), ); foreach ($inputs as $input) { list($string, $width, $expect) = $input; $this->assertEqual( $expect, phutil_utf8_hard_wrap($string, $width), pht("Wrapping of '%s'", $string)); } } public function testUTF8ConvertParams() { $caught = null; try { phutil_utf8_convert('', 'utf8', ''); } catch (Exception $ex) { $caught = $ex; } $this->assertTrue((bool)$caught, pht('Requires source encoding.')); $caught = null; try { phutil_utf8_convert('', '', 'utf8'); } catch (Exception $ex) { $caught = $ex; } 
$this->assertTrue((bool)$caught, pht('Requires target encoding.')); } public function testUTF8Convert() { if (!function_exists('mb_convert_encoding')) { $this->assertSkipped(pht('Requires %s extension.', 'mbstring')); } // "[ae]gis se[n]or [(c)] 1970 [+/-] 1 [degree]" $input = "\xE6gis SE\xD1OR \xA9 1970 \xB11\xB0"; $expect = "\xC3\xA6gis SE\xC3\x91OR \xC2\xA9 1970 \xC2\xB11\xC2\xB0"; $output = phutil_utf8_convert($input, 'UTF-8', 'ISO-8859-1'); $this->assertEqual($expect, $output, pht('Conversion from ISO-8859-1.')); $caught = null; try { phutil_utf8_convert('xyz', 'moon language', 'UTF-8'); } catch (Exception $ex) { $caught = $ex; } $this->assertTrue((bool)$caught, pht('Conversion with bogus encoding.')); } public function testUTF8ucwords() { $tests = array( '' => '', 'x' => 'X', 'X' => 'X', 'five short graybles' => 'Five Short Graybles', 'xXxSNiPeRKiLLeRxXx' => 'XXxSNiPeRKiLLeRxXx', ); foreach ($tests as $input => $expect) { $this->assertEqual( $expect, phutil_utf8_ucwords($input), 'phutil_utf8_ucwords("'.$input.'")'); } } public function testUTF8strtolower() { $tests = array( '' => '', 'a' => 'a', 'A' => 'a', '!' => '!', 'OMG!~ LOLolol ROFLwaffle11~' => 'omg!~ lololol roflwaffle11~', "\xE2\x98\x83" => "\xE2\x98\x83", ); foreach ($tests as $input => $expect) { $this->assertEqual( $expect, phutil_utf8_strtolower($input), 'phutil_utf8_strtolower("'.$input.'")'); } } public function testUTF8strtoupper() { $tests = array( '' => '', 'a' => 'A', 'A' => 'A', '!' => '!', 'Cats have 9 lives.' 
=> 'CATS HAVE 9 LIVES.', "\xE2\x98\x83" => "\xE2\x98\x83", ); foreach ($tests as $input => $expect) { $this->assertEqual( $expect, phutil_utf8_strtoupper($input), 'phutil_utf8_strtoupper("'.$input.'")'); } } public function testUTF8IsCombiningCharacter() { $character = "\xCD\xA0"; $this->assertEqual( true, phutil_utf8_is_combining_character($character)); $character = 'a'; $this->assertEqual( false, phutil_utf8_is_combining_character($character)); } public function testUTF8vCombined() { // Empty string. $string = ''; $this->assertEqual(array(), phutil_utf8v_combined($string)); // Single character. $string = 'x'; $this->assertEqual(array('x'), phutil_utf8v_combined($string)); // No combining characters. $string = 'cat'; $this->assertEqual(array('c', 'a', 't'), phutil_utf8v_combined($string)); // String with a combining character in the middle. $string = "ca\xCD\xA0t"; $this->assertEqual( array('c', "a\xCD\xA0", 't'), phutil_utf8v_combined($string)); // String starting with a combined character. $string = "c\xCD\xA0at"; $this->assertEqual( array("c\xCD\xA0", 'a', 't'), phutil_utf8v_combined($string)); // String with trailing combining character. $string = "cat\xCD\xA0"; $this->assertEqual( array('c', 'a', "t\xCD\xA0"), phutil_utf8v_combined($string)); // String with muliple combined characters. $string = "c\xCD\xA0a\xCD\xA0t\xCD\xA0"; $this->assertEqual( array("c\xCD\xA0", "a\xCD\xA0", "t\xCD\xA0"), phutil_utf8v_combined($string)); // String with multiple combining characters. $string = "ca\xCD\xA0\xCD\xA0t"; $this->assertEqual( array('c', "a\xCD\xA0\xCD\xA0", 't'), phutil_utf8v_combined($string)); // String beginning with a combining character. $string = "\xCD\xA0\xCD\xA0c"; $this->assertEqual( array(" \xCD\xA0\xCD\xA0", 'c'), phutil_utf8v_combined($string)); } public function testUTF8BMPSegfaults() { // This test case fails by segfaulting, or passes by not segfaulting. See // the function implementation for details. 
$input = str_repeat("\xEF\xBF\xBF", 1024 * 32); phutil_is_utf8_with_only_bmp_characters($input); $this->assertTrue(true); } public function testCJK() { $map = array( '' => false, 'a' => false, '.' => false, "\xE2\x98\x83" => false, "\xE5\xA0\xB1" => true, ); foreach ($map as $input => $expect) { $actual = phutil_utf8_is_cjk($input); $this->assertEqual($expect, $actual, pht('CJK: "%s"', $input)); } } public function testUTF8BMP() { $tests = array( '' => array( true, true, pht('empty string'), ), 'a' => array( true, true, 'a', ), "a\xCD\xA0\xCD\xA0" => array( true, true, pht('%s with combining', 'a'), ), "\xE2\x98\x83" => array( true, true, pht('snowman'), ), // This is the last character in BMP, U+FFFF. "\xEF\xBF\xBF" => array( true, true, 'U+FFFF', ), // This isn't valid. "\xEF\xBF\xC0" => array( false, false, pht('Invalid, byte range.'), ), // This is an invalid nonminimal representation. "\xF0\x81\x80\x80" => array( false, false, pht('Nonminimal 4-byte character.'), ), // This is the first character above BMP, U+10000. "\xF0\x90\x80\x80" => array( true, false, 'U+10000', ), "\xF0\x9D\x84\x9E" => array( true, false, 'gclef', ), "musical \xF0\x9D\x84\x9E g-clef" => array( true, false, pht('gclef text'), ), "\xF0\x9D\x84" => array( false, false, pht('Invalid, truncated.'), ), "\xE0\x80\x80" => array( false, false, pht('Nonminimal 3-byte character.'), ), // Partial BMP characters. "\xCD" => array( false, false, pht('Partial 2-byte character.'), ), "\xE0\xA0" => array( false, false, pht('Partial BMP 0xE0 character.'), ), "\xE2\x98" => array( false, false, pht('Partial BMP cahracter.'), ), ); foreach ($tests as $input => $test) { list($expect_utf8, $expect_bmp, $test_name) = $test; // Depending on what's installed on the system, this may use an // extension. $this->assertEqual( $expect_utf8, phutil_is_utf8($input), pht('is_utf(%s)', $test_name)); // Also test this against the pure PHP implementation, explicitly. 
$this->assertEqual( $expect_utf8, phutil_is_utf8_slowly($input), pht('is_utf_slowly(%s)', $test_name)); $this->assertEqual( $expect_bmp, phutil_is_utf8_with_only_bmp_characters($input), pht('is_utf_bmp(%s)', $test_name)); } } public function testSystemLocaleManagement() { $original_locale = phutil_get_system_locale(); $this->assertTrue( (strlen($original_locale) > 0), pht('System has some identifiable locale.')); $this->assertFalse( phutil_is_system_locale_available('duck.quack'), pht('Imaginary locale should be unavailable.')); $this->assertEqual( $original_locale, phutil_get_system_locale(), pht('Testing locale availability should not change the locale.')); $this->assertTrue( phutil_is_system_locale_available($original_locale), pht('The current locale should be available.')); $caught = null; try { phutil_set_system_locale('duck.quack'); } catch (Exception $ex) { $caught = $ex; } $this->assertTrue( ($caught instanceof Exception), pht('Setting an imaginary locale should raise an exception.')); // We need two locales for the next part because one of them might be the // current locale, and we want to make sure we can actually change the // locale value. // If the current locale was "zz_ZZ", and then we do this: // // set_locale("zz_ZZ"); // assert("zz_ZZ" == get_locale()); // // ...the test could pass even if "set_locale(...)" does nothing. $has_us = phutil_is_system_locale_available('en_US.UTF-8'); $has_gb = phutil_is_system_locale_available('en_GB.UTF-8'); if (!$has_us || !$has_gb) { $this->assertSkipped( pht( 'System does not have en_US + en_GB to do locale adjustment '. 'tests.')); } phutil_set_system_locale('en_US.UTF-8'); $this->assertEqual( 'en_US.UTF-8', phutil_get_system_locale(), pht('Set locale to en_US.')); phutil_set_system_locale('en_GB.UTF-8'); $this->assertEqual( 'en_GB.UTF-8', phutil_get_system_locale(), pht('Set locale to en_GB.')); // Put things back the way they were. phutil_set_system_locale($original_locale); } }