diff --git a/src/difference/ArcanistDiffUtils.php b/src/difference/ArcanistDiffUtils.php --- a/src/difference/ArcanistDiffUtils.php +++ b/src/difference/ArcanistDiffUtils.php @@ -56,7 +56,30 @@ ); } - return self::computeIntralineEdits($o, $n); + // Do a fast check for certainly-too-long inputs before splitting the + // lines. Inputs take ~200x more memory to represent as lists than as + // strings, so we can run out of memory quickly if we try to split huge + // inputs. See T11744. + $ol = strlen($o); + $nl = strlen($n); + + $max_glyphs = 80; + + // This has some wiggle room for multi-byte UTF8 characters, and the + // fact that we're testing the sum of the lengths of both strings. It can + // still generate false positives for, say, Chinese text liberally + // slathered with combining characters, but this kind of text should be + // vitually nonexistent in real data. + $too_many_bytes = (16 * $max_glyphs); + + if ($ol + $nl > $too_many_bytes) { + return array( + array(array(1, $ol)), + array(array(1, $nl)), + ); + } + + return self::computeIntralineEdits($o, $n, $max_glyphs); } public static function applyIntralineDiff($str, $intra_stack) { @@ -155,7 +178,7 @@ ->getEditString(); } - public static function computeIntralineEdits($o, $n) { + private static function computeIntralineEdits($o, $n, $max_glyphs) { if (preg_match('/[\x80-\xFF]/', $o.$n)) { $ov = phutil_utf8v_combined($o); $nv = phutil_utf8v_combined($n); @@ -166,7 +189,7 @@ $multibyte = false; } - $result = self::generateEditString($ov, $nv); + $result = self::generateEditString($ov, $nv, $max_glyphs); // Now we have a character-based description of the edit. We need to // convert into a byte-based description. Walk through the edit string and