Differential D9368 Diff 23140 src/utils/PhutilUTF8StringTruncator.php

Changeset View

Standalone View

src/utils/PhutilUTF8StringTruncator.php

This file was added.

				<?php

				/**
				* Truncate a UTF-8 string to some maximum number of bytes, codepoints, or
				* glyphs.
				*
				* This class takes some UTF-8 string as input, and emits a valid UTF-8 string
				* which is no longer than given byte, codepoint, or glyph limits.
				*
				* $short = id(new PhutilUTF8StringTruncator())
				* ->setMaximumGlyphs(80)
				* ->truncateString($long);
				*
				* Byte limits restrict the number of bytes the result may contain. They are
				* appropriate when you care about how much storage a string requires.
				*
				* Codepoint limits restrict the number of codepoints the result may contain.
				* Since codepoints may have up to 4 bytes, the resulting strings may require
				* more than this many bytes. This kind of limit is appropriate when you
				* are using UTF-8 storage, like MySQL.
				*
				* Glyph limits restrict the display size of the string. Because a single glyph
				* may have an arbitrary number of combining characters, this does not impose
				* a storage size limit on the string: a string with only one glyph may require
				* an arbitrarily large number of bytes.
				*
				* You can set more than one limit; the smallest limit will be used.
				*
				* NOTE: This function makes a best effort to apply some reasonable rules but
				* will not work well for the full range of unicode languages.
				*
				* @group utf8
				*/
				final class PhutilUTF8StringTruncator extends Phobject {

				  private $maximumBytes;
				  private $maximumCodepoints;
				  private $maximumGlyphs;

				  // Smallest of the configured limits. Stays INF while no limit has been
				  // configured, so truncateString() returns its input untouched.
				  private $minimumLimit = INF;

				  // The default terminator is U+2026 (horizontal ellipsis). Its sizes are
				  // cached so they do not need to be recomputed for every string.
				  private $terminator = "\xE2\x80\xA6";
				  private $terminatorBytes = 3;
				  private $terminatorCodepoints = 1;
				  private $terminatorGlyphs = 1;

				  /**
				   * Limit the number of bytes in the result.
				   *
				   * @param int Maximum byte length. A falsy value (0 or null) disables
				   *            this limit.
				   * @return this
				   */
				  public function setMaximumBytes($maximum_bytes) {
				    $this->maximumBytes = $maximum_bytes;
				    $this->didUpdateMaxima();
				    return $this;
				  }

				  /**
				   * Limit the number of codepoints in the result.
				   *
				   * @param int Maximum codepoint count. A falsy value (0 or null)
				   *            disables this limit.
				   * @return this
				   */
				  public function setMaximumCodepoints($maximum_codepoints) {
				    $this->maximumCodepoints = $maximum_codepoints;
				    $this->didUpdateMaxima();
				    return $this;
				  }

				  /**
				   * Limit the number of glyphs in the result.
				   *
				   * @param int Maximum glyph count. A falsy value (0 or null) disables
				   *            this limit.
				   * @return this
				   */
				  public function setMaximumGlyphs($maximum_glyphs) {
				    $this->maximumGlyphs = $maximum_glyphs;
				    $this->didUpdateMaxima();
				    return $this;
				  }

				  /**
				   * Recompute the smallest configured limit, which drives the byte-length
				   * fast path in @{method:truncateString}.
				   *
				   * @return void
				   */
				  private function didUpdateMaxima() {
				    $this->minimumLimit = INF;

				    if ($this->maximumBytes) {
				      $this->minimumLimit = min($this->minimumLimit, $this->maximumBytes);
				    }

				    if ($this->maximumCodepoints) {
				      $this->minimumLimit = min(
				        $this->minimumLimit,
				        $this->maximumCodepoints);
				    }

				    if ($this->maximumGlyphs) {
				      $this->minimumLimit = min($this->minimumLimit, $this->maximumGlyphs);
				    }
				  }

				  /**
				   * Set the string appended to truncated results, and cache its size in
				   * bytes, codepoints and glyphs.
				   *
				   * @param string Terminator to append. May be the empty string.
				   * @return this
				   */
				  public function setTerminator($terminator) {
				    $this->terminator = $terminator;
				    $this->terminatorBytes = strlen($terminator);
				    $this->terminatorCodepoints = count(phutil_utf8v($terminator));
				    $this->terminatorGlyphs = count(phutil_utf8v_combined($terminator));
				    return $this;
				  }

				  /**
				   * Truncate a string so that it satisfies every configured limit.
				   *
				   * Strings which already satisfy the limits are returned unmodified.
				   * Otherwise the string is cut on a glyph boundary, preferring to end at
				   * sentence punctuation (no terminator is appended) or at a word break
				   * (the terminator is appended).
				   *
				   * If not even one glyph fits alongside the terminator, the terminator is
				   * returned on its own, even if it is longer than the limits.
				   *
				   * @param string String to truncate.
				   * @return string Truncated string.
				   */
				  public function truncateString($string) {
				    // First, check if the string has fewer bytes than the most restrictive
				    // limit. Codepoints and glyphs always take up at least one byte, so we
				    // can just return the string unmodified if we're under all of the
				    // limits.
				    $byte_len = strlen($string);
				    if ($byte_len <= $this->minimumLimit) {
				      return $string;
				    }

				    // If we need the vector of codepoints, build it.
				    $string_pv = null;
				    $point_len = 0;
				    if ($this->maximumCodepoints) {
				      $string_pv = phutil_utf8v($string);
				      $point_len = count($string_pv);
				    }

				    // We always need the combined vector, even if we're only doing byte or
				    // codepoint truncation, because we don't want to truncate to half of a
				    // combining character.
				    $string_gv = phutil_utf8v_combined($string);
				    $glyph_len = count($string_gv);

				    // Now, check if we're still over the limits. For example, a string may
				    // be over the raw byte limit but under the glyph limit if it contains
				    // several multibyte characters.
				    $too_long = false;
				    if ($this->maximumBytes && ($byte_len > $this->maximumBytes)) {
				      $too_long = true;
				    }
				    if ($this->maximumCodepoints &&
				        ($point_len > $this->maximumCodepoints)) {
				      $too_long = true;
				    }
				    if ($this->maximumGlyphs && ($glyph_len > $this->maximumGlyphs)) {
				      $too_long = true;
				    }

				    if (!$too_long) {
				      return $string;
				    }

				    // This string is legitimately longer than at least one of the limits,
				    // so we need to truncate it. Find the minimum cutoff point: this is the
				    // number of leading glyphs we can keep while satisfying the limits and
				    // having space for the terminator.
				    //
				    // Every configured limit is applied here, including limits the input
				    // itself satisfies: once the terminator is appended, the result can
				    // exceed a limit which the original string was under.
				    $cutoff = $glyph_len;

				    if ($this->maximumBytes) {
				      $bytes = $this->terminatorBytes;
				      for ($ii = 0; $ii < $cutoff; $ii++) {
				        $bytes += strlen($string_gv[$ii]);
				        if ($bytes > $this->maximumBytes) {
				          $cutoff = $ii;
				          break;
				        }
				      }
				    }

				    if ($this->maximumCodepoints) {
				      $points = 0;
				      for ($ii = 0; $ii < $cutoff; $ii++) {
				        // Consume the codepoints which make up this glyph by matching
				        // their byte lengths against the byte length of the glyph.
				        $glyph_bytes = strlen($string_gv[$ii]);
				        while ($points < $point_len) {
				          $glyph_bytes -= strlen($string_pv[$points]);
				          $points++;
				          if ($glyph_bytes <= 0) {
				            break;
				          }
				        }

				        $points_total = $points + $this->terminatorCodepoints;
				        if ($points_total > $this->maximumCodepoints) {
				          $cutoff = $ii;
				          break;
				        }
				      }
				    }

				    if ($this->maximumGlyphs) {
				      $cutoff = min(
				        $cutoff,
				        $this->maximumGlyphs - $this->terminatorGlyphs);
				    }

				    // If we don't have enough characters for anything, just return the
				    // terminator.
				    if ($cutoff <= 0) {
				      return $this->terminator;
				    }

				    // Otherwise, we're going to try to cut the string off somewhere
				    // reasonable rather than somewhere arbitrary.

				    // NOTE: This is not complete, and there are many other word boundary
				    // characters and reasonable places to break words in the UTF-8
				    // character space. For now, this gives us reasonable behavior for latin
				    // languages. We don't necessarily have access to PCRE+Unicode so there
				    // isn't a great way for us to look up character attributes.

				    // If we encounter these, prefer to break on them instead of cutting the
				    // string off in the middle of a word.
				    static $break_characters = array(
				      ' ' => true,
				      "\n" => true,
				      ';' => true,
				      ':' => true,
				      '[' => true,
				      '(' => true,
				      ',' => true,
				      '-' => true,
				    );

				    // If we encounter these, shorten to this character exactly without
				    // appending the terminal.
				    static $stop_characters = array(
				      '.' => true,
				      '!' => true,
				      '?' => true,
				    );

				    // Search backward in the string, looking for reasonable places to break
				    // it.
				    $word_boundary = null;
				    $stop_boundary = null;

				    // The glyph at index $cutoff is the first glyph we would drop. If it is
				    // a stop character we can keep it in place of the terminator, but only
				    // when omitting the terminator frees at least one byte, codepoint and
				    // glyph. With an empty terminator, keeping it would exceed the limits.
				    $terminator_size = min(
				      $this->terminatorBytes,
				      $this->terminatorCodepoints,
				      $this->terminatorGlyphs);
				    $stop_fits_at_cutoff = ($terminator_size >= 1);

				    // Only search backward for a while. At some point we don't get a better
				    // result by looking through the whole string, and if this is "MMM..."
				    // or a non-latin language without word break characters we're just
				    // wasting time.
				    $search = max(0, $cutoff - 256);
				    for ($ii = min($cutoff, $glyph_len - 1); $ii >= $search; $ii--) {
				      $c = $string_gv[$ii];

				      if (isset($break_characters[$c])) {
				        $word_boundary = $ii;
				        continue;
				      }

				      if (isset($stop_characters[$c])) {
				        if ($ii < $cutoff || $stop_fits_at_cutoff) {
				          $stop_boundary = $ii + 1;
				          break;
				        }
				        // This stop character does not fit; treat it like any other
				        // non-break character.
				      }

				      // An ordinary character ends a run of break characters, so the
				      // boundary we already found is the one to use.
				      if ($word_boundary !== null) {
				        break;
				      }
				    }

				    if ($stop_boundary !== null) {
				      // We found a character like ".". Cut the string there, without
				      // appending the terminal.
				      $string_part = array_slice($string_gv, 0, $stop_boundary);
				      return implode('', $string_part);
				    }

				    // If we didn't find any boundary characters or we found ONLY boundary
				    // characters, just break at the maximum character length.
				    if ($word_boundary === null || $word_boundary === 0) {
				      $word_boundary = $cutoff;
				    }

				    $string_part = array_slice($string_gv, 0, $word_boundary);
				    $string_part = implode('', $string_part);

				    return $string_part.$this->terminator;
				  }

				}