diff --git a/src/utils/PhutilEditDistanceMatrix.php b/src/utils/PhutilEditDistanceMatrix.php --- a/src/utils/PhutilEditDistanceMatrix.php +++ b/src/utils/PhutilEditDistanceMatrix.php @@ -288,11 +288,15 @@ $str = strrev($str); + // We pad the edit string before smoothing it, so ranges of similar + // characters at the beginning or end of the string can also be smoothed. + $str = $this->padEditString($str); + if ($this->getApplySmoothing()) { $str = $this->applySmoothing($str); } - return $this->padEditString($str); + return $str; } private function padEditString($str) { @@ -512,9 +516,9 @@ // since there are fewer choppy runs of short added and removed substrings. do { $original = $result; - $result = preg_replace('/([xdi])(s{3})([xdi])/', '$1xxx$3', $result); - $result = preg_replace('/([xdi])(s{2})([xdi])/', '$1xx$3', $result); - $result = preg_replace('/([xdi])(s{1})([xdi])/', '$1x$3', $result); + $result = preg_replace('/(^|[xdi])(s{3})([xdi]|\z)/', '$1xxx$3', $result); + $result = preg_replace('/(^|[xdi])(s{2})([xdi]|\z)/', '$1xx$3', $result); + $result = preg_replace('/(^|[xdi])(s{1})([xdi]|\z)/', '$1x$3', $result); } while ($result != $original); return $result; diff --git a/src/utils/PhutilProseDiff.php b/src/utils/PhutilProseDiff.php --- a/src/utils/PhutilProseDiff.php +++ b/src/utils/PhutilProseDiff.php @@ -20,6 +20,7 @@ // Reorder sequences of removed and added sections to put all the "-" // parts together first, then all the "+" parts together. This produces // a more human-readable result than intermingling them. + $o_run = array(); $n_run = array(); $result = array(); @@ -33,24 +34,22 @@ $n_run[] = $part; break; default: - foreach ($o_run as $o) { - $result[] = $o; - } - foreach ($n_run as $n) { - $result[] = $n; + if ($o_run || $n_run) { + foreach ($this->combineRuns($o_run, $n_run) as $merged_part) { + $result[] = $merged_part; + } + $o_run = array(); + $n_run = array(); } $result[] = $part; - $o_run = array(); - $n_run = array(); break; } } - foreach ($o_run as $o) { - $result[] = $o; - } - foreach ($n_run as $n) { - $result[] = $n; + if ($o_run || $n_run) { + foreach ($this->combineRuns($o_run, $n_run) as $part) { + $result[] = $part; + } } // Now, combine consecuitive runs of the same type of change (like a @@ -88,4 +87,109 @@ return $this; } + private function combineRuns($o_run, $n_run) { + $o_merge = $this->mergeParts($o_run); + $n_merge = $this->mergeParts($n_run); + + // When removed and added blocks share a prefix or suffix, we sometimes + // want to count it as unchanged (for example, if it is whitespace) but + // sometimes want to count it as changed (for example, if it is a word + // suffix like "ing"). Find common prefixes and suffixes of these layout + // characters and emit them as "=" (unchanged) blocks. + + $layout_characters = array( + ' ' => true, + "\n" => true, + '.' => true, + '!' => true, + ',' => true, + '?' => true, + ); + + $o_text = $o_merge['text']; + $n_text = $n_merge['text']; + $o_len = strlen($o_text); + $n_len = strlen($n_text); + $min_len = min($o_len, $n_len); + + $prefix_len = 0; + for ($pos = 0; $pos < $min_len; $pos++) { + $o = $o_text[$pos]; + $n = $n_text[$pos]; + if ($o !== $n) { + break; + } + if (empty($layout_characters[$o])) { + break; + } + $prefix_len++; + } + + $suffix_len = 0; + for ($pos = 1; $pos <= $min_len; $pos++) { + $o = $o_text[$o_len - $pos]; + $n = $n_text[$n_len - $pos]; + if ($o !== $n) { + break; + } + if (empty($layout_characters[$o])) { + break; + } + $suffix_len++; + } + + $results = array(); + + if ($prefix_len) { + $results[] = array( + 'type' => '=', + 'text' => substr($o_text, 0, $prefix_len), + ); + } + + if ($prefix_len < $o_len) { + $results[] = array( + 'type' => '-', + 'text' => substr($o_text, $prefix_len, $o_len - $suffix_len), + ); + } + + if ($prefix_len < $n_len) { + $results[] = array( + 'type' => '+', + 'text' => substr($n_text, $prefix_len, $n_len - $suffix_len), + ); + } + + if ($suffix_len) { + $results[] = array( + 'type' => '=', + 'text' => substr($o_text, -$suffix_len), + ); + } + + return $results; + } + + private function mergeParts(array $parts) { + $text = ''; + $type = null; + foreach ($parts as $part) { + $part_type = $part['type']; + if ($type === null) { + $type = $part_type; + } + if ($type !== $part_type) { + throw new Exception(pht('Can not merge parts of dissimilar types!')); + } + $text .= $part['text']; + } + + return array( + 'type' => $type, + 'text' => $text, + ); + } + + } diff --git a/src/utils/__tests__/PhutilProseDiffTestCase.php b/src/utils/__tests__/PhutilProseDiffTestCase.php --- a/src/utils/__tests__/PhutilProseDiffTestCase.php +++ b/src/utils/__tests__/PhutilProseDiffTestCase.php @@ -57,6 +57,35 @@ ), pht('"Says/remarks" word edit smoothenss.')); + $this->assertProseParts( + 'See screenshots', + 'Viewed video files', + array( + '- See screenshots', + '+ Viewed video files', + ), + pht('Complete paragraph rewrite.')); + + $this->assertProseParts( + 'xaaax', + 'xbbbx', + array( + '- xaaax', + '+ xbbbx', + ), + pht('Whole word rewrite with common prefix and suffix.')); + + $this->assertProseParts( + ' aaa ', + ' bbb ', + array( + '= ', + '- aaa', + '+ bbb', + '= ', + ), + pht('Whole word rewrite with whitespace prefix and suffix.')); + } private function assertProseParts($old, $new, array $expect_parts, $label) {