D16440.id.diff
View Options

	diff --git a/src/__phutil_library_map__.php b/src/__phutil_library_map__.php
	--- a/src/__phutil_library_map__.php
	+++ b/src/__phutil_library_map__.php
	@@ -506,6 +506,7 @@
	'phutil_units' => 'utils/utils.php',
	'phutil_utf8_console_strlen' => 'utils/utf8.php',
	'phutil_utf8_convert' => 'utils/utf8.php',
	+ 'phutil_utf8_encode_codepoint' => 'utils/utf8.php',
	'phutil_utf8_hard_wrap' => 'utils/utf8.php',
	'phutil_utf8_hard_wrap_html' => 'utils/utf8.php',
	'phutil_utf8_is_combining_character' => 'utils/utf8.php',
	diff --git a/src/utils/__tests__/PhutilUTF8TestCase.php b/src/utils/__tests__/PhutilUTF8TestCase.php
	--- a/src/utils/__tests__/PhutilUTF8TestCase.php
	+++ b/src/utils/__tests__/PhutilUTF8TestCase.php
	@@ -42,6 +42,68 @@
	$this->assertEqual($expect, $result);
	}

	+ public function testOverlongFormFiltering() {
	+ $bad = "\xEF\xBF\xBD";
	+
	+ $map = array(
	+ 'quack' => 'quack',
	+
	+ // This is U+1000, a valid character.
	+ "\xE1\x80\x80" => "\xE1\x80\x80",
	+
	+ // This is a 2-byte encoding of U+0000.
	+ "\xC0\x80" => "{$bad}{$bad}",
	+
	+ // This is a 3-byte encoding of U+0020.
	+ "\xE0\x80\xA0" => "{$bad}{$bad}{$bad}",
	+
	+ "A \xE0\x83\x83" => "A {$bad}{$bad}{$bad}",
	+ );
	+
	+ foreach ($map as $input => $expect) {
	+ $actual = phutil_utf8ize($input);
	+ $this->assertEqual(
	+ $expect,
	+ $actual,
	+ pht('Overlong form canonicalization of: %s', $input));
	+ }
	+ }
	+
	+ public function testSurrogateFiltering() {
	+ $bad = "\xEF\xBF\xBD";
	+
	+ $map = array(
	+ "A \xED\xA9\x98" => "A {$bad}{$bad}{$bad}",
	+ );
	+
	+ foreach ($map as $input => $expect) {
	+ $actual = phutil_utf8ize($input);
	+ $this->assertEqual(
	+ $expect,
	+ $actual,
	+ pht('Surrogate filtering: %s', $input));
	+ }
	+ }
	+
	+
	+ public function testUTF8CodepointEncoding() {
	+ $map = array(
	+ 0x20 => ' ',
	+ 0x7E => '~',
	+ 0xE9 => "\xC3\xA9",
	+ 0x2603 => "\xE2\x98\x83",
	+ 0x1F417 => "\xF0\x9F\x90\x97",
	+ );
	+
	+ foreach ($map as $input => $expect) {
	+ $actual = phutil_utf8_encode_codepoint($input);
	+ $this->assertEqual(
	+ $expect,
	+ $actual,
	+ pht('UTF8 codepoint encoding of "%s".', $input));
	+ }
	+ }
	+
	public function testUTF8len() {
	$strings = array(
	'' => 0,
	diff --git a/src/utils/utf8.php b/src/utils/utf8.php
	--- a/src/utils/utf8.php
	+++ b/src/utils/utf8.php
	@@ -6,6 +6,8 @@
	* When invalid byte subsequences are encountered, they will be replaced with
	* U+FFFD, the Unicode replacement character.
	*
	+ * This function treats overlong encodings as invalid.
	+ *
	* @param string String to convert to valid UTF-8.
	* @return string String with invalid UTF-8 byte subsequences replaced with
	* U+FFFD.
	@@ -21,27 +23,48 @@
	// TODO: Provide an optional fast C implementation ala fb_utf8ize() if this
	// ever shows up in profiles?

	- // NOTE: Overlong 3-byte and 4-byte representations incorrectly survive
	- // this function.
	-
	$result = array();

	$regex =
	"/([\x01-\x7F]".
	- "|[\xC2-\xDF][\x80-\xBF]".
	- "|[\xE0-\xEF][\x80-\xBF][\x80-\xBF]".
	- "|[\xF0-\xF4][\x80-\xBF][\x80-\xBF][\x80-\xBF])".
	+ "|[\xC2-\xDF][\x80-\xBF]".
	+ "|[\xE0][\xA0-\xBF][\x80-\xBF]".
	+ "|[\xE1-\xEF][\x80-\xBF][\x80-\xBF]".
	+ "|[\xF0][\x90-\xBF][\x80-\xBF][\x80-\xBF]".
	+ "|[\xF1-\xF3][\x80-\xBF][\x80-\xBF][\x80-\xBF]".
	+ "|[\xF4][\x80-\x8F][\x80-\xBF][\x80-\xBF])".
	"|(.)/";

	+ $replacement = "\xEF\xBF\xBD";
	+
	$offset = 0;
	$matches = null;
	while (preg_match($regex, $string, $matches, 0, $offset)) {
	if (!isset($matches[2])) {
	- $result[] = $matches[1];
	+ $match = $matches[1];
	+
	+ if ($match[0] == "\xED") {
	+ // If this is a 3-byte character that may be part of one of the
	+ // surrogate ranges, check if it's actually in those ranges. Reject
	+ // it as invalid if it is. These sequences are used in UTF16 and
	+ // functions like json_encode() refuse to encode them.
	+
	+ $codepoint = ((ord($match[0]) & 0x0F) << 12)
	+ + ((ord($match[1]) & 0x3F) << 6)
	+ + ((ord($match[2]) & 0x3F));
	+ if ($codepoint >= 0xD800 && $codepoint <= 0xDFFF) {
	+ $result[] = str_repeat($replacement, strlen($match));
	+ $offset += strlen($matches[0]);
	+ continue;
	+ }
	+ }
	+
	+ $result[] = $match;
	} else {
	// Unicode replacement character, U+FFFD.
	- $result[] = "\xEF\xBF\xBD";
	+ $result[] = $replacement;
	}
	+
	$offset += strlen($matches[0]);
	}

	@@ -415,6 +438,38 @@


	/**
	+ * Convert a Unicode codepoint into a UTF8-encoded string.
	+ *
	+ * @param int Unicode codepoint.
	+ * @return string UTF8 encoding.
	+ */
	+function phutil_utf8_encode_codepoint($codepoint) {
	+ if ($codepoint < 0x80) {
	+ $r = chr($codepoint);
	+ } else if ($codepoint < 0x800) {
	+ $r = chr(0xC0 | (($codepoint >> 6) & 0x1F)).
	+ chr(0x80 | (($codepoint) & 0x3F));
	+ } else if ($codepoint < 0x10000) {
	+ $r = chr(0xE0 | (($codepoint >> 12) & 0x0F)).
	+ chr(0x80 | (($codepoint >> 6) & 0x3F)).
	+ chr(0x80 | (($codepoint) & 0x3F));
	+ } else if ($codepoint < 0x110000) {
	+ $r = chr(0xF0 | (($codepoint >> 18) & 0x07)).
	+ chr(0x80 | (($codepoint >> 12) & 0x3F)).
	+ chr(0x80 | (($codepoint >> 6) & 0x3F)).
	+ chr(0x80 | (($codepoint) & 0x3F));
	+ } else {
	+ throw new Exception(
	+ pht(
	+ 'Encoding UTF8 codepoint "%s" is not supported.',
	+ $codepoint));
	+ }
	+
	+ return $r;
	+}
	+
	+
	+/**
	* Hard-wrap a block of UTF-8 text with embedded HTML tags and entities.
	*
	* @param string An HTML string with tags and entities.

File Metadata

Mime Type: text/plain
Expires: Mar 17 2025, 2:46 AM (4 w, 5 d ago)
Storage Engine: blob
Storage Format: Encrypted (AES-256-CBC)
Storage Handle: 7706423
Default Alt Text: D16440.id.diff (5 KB)

D16440.id.diff
No OneTemporary
Actions

D16440.id.diff
View Options

File Metadata

Event Timeline

D16440.id.diffNo OneTemporaryActions

D16440.id.diffView Options

File Metadata

Event Timeline

D16440.id.diff
No OneTemporary
Actions

D16440.id.diff
View Options