Page MenuHomePhabricator

D20838.diff
No OneTemporary

D20838.diff

diff --git a/src/__phutil_library_map__.php b/src/__phutil_library_map__.php
--- a/src/__phutil_library_map__.php
+++ b/src/__phutil_library_map__.php
@@ -5599,6 +5599,9 @@
'PhutilOAuthAuthAdapter' => 'applications/auth/adapter/PhutilOAuthAuthAdapter.php',
'PhutilPHPCodeSnippetContextFreeGrammar' => 'infrastructure/lipsum/code/PhutilPHPCodeSnippetContextFreeGrammar.php',
'PhutilPhabricatorAuthAdapter' => 'applications/auth/adapter/PhutilPhabricatorAuthAdapter.php',
+ 'PhutilProseDiff' => 'infrastructure/diff/prose/PhutilProseDiff.php',
+ 'PhutilProseDiffTestCase' => 'infrastructure/diff/prose/__tests__/PhutilProseDiffTestCase.php',
+ 'PhutilProseDifferenceEngine' => 'infrastructure/diff/prose/PhutilProseDifferenceEngine.php',
'PhutilQsprintfInterface' => 'infrastructure/storage/xsprintf/PhutilQsprintfInterface.php',
'PhutilQueryString' => 'infrastructure/storage/xsprintf/PhutilQueryString.php',
'PhutilRealNameContextFreeGrammar' => 'infrastructure/lipsum/PhutilRealNameContextFreeGrammar.php',
@@ -12398,6 +12401,9 @@
'PhutilOAuthAuthAdapter' => 'PhutilAuthAdapter',
'PhutilPHPCodeSnippetContextFreeGrammar' => 'PhutilCLikeCodeSnippetContextFreeGrammar',
'PhutilPhabricatorAuthAdapter' => 'PhutilOAuthAuthAdapter',
+ 'PhutilProseDiff' => 'Phobject',
+ 'PhutilProseDiffTestCase' => 'PhutilTestCase',
+ 'PhutilProseDifferenceEngine' => 'Phobject',
'PhutilQueryString' => 'Phobject',
'PhutilRealNameContextFreeGrammar' => 'PhutilContextFreeGrammar',
'PhutilRemarkupAnchorRule' => 'PhutilRemarkupRule',
diff --git a/src/infrastructure/diff/prose/PhutilProseDiff.php b/src/infrastructure/diff/prose/PhutilProseDiff.php
new file mode 100644
--- /dev/null
+++ b/src/infrastructure/diff/prose/PhutilProseDiff.php
@@ -0,0 +1,292 @@
+<?php
+
+final class PhutilProseDiff extends Phobject {
+
+ private $parts = array();
+
+ public function addPart($type, $text) {
+ $this->parts[] = array(
+ 'type' => $type,
+ 'text' => $text,
+ );
+ return $this;
+ }
+
+ public function getParts() {
+ return $this->parts;
+ }
+
+ /**
+ * Get diff parts, but replace large blocks of unchanged text with "."
+ * parts representing missing context.
+ */
+ public function getSummaryParts() {
+ $parts = $this->getParts();
+
+ $head_key = head_key($parts);
+ $last_key = last_key($parts);
+
+ $results = array();
+ foreach ($parts as $key => $part) {
+ $is_head = ($key == $head_key);
+ $is_last = ($key == $last_key);
+
+ switch ($part['type']) {
+ case '=':
+ $pieces = $this->splitTextForSummary($part['text']);
+
+ if ($is_head || $is_last) {
+ $need = 2;
+ } else {
+ $need = 3;
+ }
+
+ // We don't have enough pieces to omit anything, so just continue.
+ if (count($pieces) < $need) {
+ $results[] = $part;
+ break;
+ }
+
+ if (!$is_head) {
+ $results[] = array(
+ 'type' => '=',
+ 'text' => head($pieces),
+ );
+ }
+
+ $results[] = array(
+ 'type' => '.',
+ 'text' => null,
+ );
+
+ if (!$is_last) {
+ $results[] = array(
+ 'type' => '=',
+ 'text' => last($pieces),
+ );
+ }
+ break;
+ default:
+ $results[] = $part;
+ break;
+ }
+ }
+
+ return $results;
+ }
+
+
+ public function reorderParts() {
+ // Reorder sequences of removed and added sections to put all the "-"
+ // parts together first, then all the "+" parts together. This produces
+ // a more human-readable result than intermingling them.
+
+ $o_run = array();
+ $n_run = array();
+ $result = array();
+ foreach ($this->parts as $part) {
+ $type = $part['type'];
+ switch ($type) {
+ case '-':
+ $o_run[] = $part;
+ break;
+ case '+':
+ $n_run[] = $part;
+ break;
+ default:
+ if ($o_run || $n_run) {
+ foreach ($this->combineRuns($o_run, $n_run) as $merged_part) {
+ $result[] = $merged_part;
+ }
+ $o_run = array();
+ $n_run = array();
+ }
+ $result[] = $part;
+ break;
+ }
+ }
+
+ if ($o_run || $n_run) {
+ foreach ($this->combineRuns($o_run, $n_run) as $part) {
+ $result[] = $part;
+ }
+ }
+
+ // Now, combine consecuitive runs of the same type of change (like a
+ // series of "-" parts) into a single run.
+ $combined = array();
+
+ $last = null;
+ $last_text = null;
+ foreach ($result as $part) {
+ $type = $part['type'];
+
+ if ($last !== $type) {
+ if ($last !== null) {
+ $combined[] = array(
+ 'type' => $last,
+ 'text' => $last_text,
+ );
+ }
+ $last_text = null;
+ $last = $type;
+ }
+
+ $last_text .= $part['text'];
+ }
+
+ if ($last_text !== null) {
+ $combined[] = array(
+ 'type' => $last,
+ 'text' => $last_text,
+ );
+ }
+
+ $this->parts = $combined;
+
+ return $this;
+ }
+
+ private function combineRuns($o_run, $n_run) {
+ $o_merge = $this->mergeParts($o_run);
+ $n_merge = $this->mergeParts($n_run);
+
+ // When removed and added blocks share a prefix or suffix, we sometimes
+ // want to count it as unchanged (for example, if it is whitespace) but
+ // sometimes want to count it as changed (for example, if it is a word
+ // suffix like "ing"). Find common prefixes and suffixes of these layout
+ // characters and emit them as "=" (unchanged) blocks.
+
+ $layout_characters = array(
+ ' ' => true,
+ "\n" => true,
+ '.' => true,
+ '!' => true,
+ ',' => true,
+ '?' => true,
+ ']' => true,
+ '[' => true,
+ '(' => true,
+ ')' => true,
+ '<' => true,
+ '>' => true,
+ );
+
+ $o_text = $o_merge['text'];
+ $n_text = $n_merge['text'];
+ $o_len = strlen($o_text);
+ $n_len = strlen($n_text);
+ $min_len = min($o_len, $n_len);
+
+ $prefix_len = 0;
+ for ($pos = 0; $pos < $min_len; $pos++) {
+ $o = $o_text[$pos];
+ $n = $n_text[$pos];
+ if ($o !== $n) {
+ break;
+ }
+ if (empty($layout_characters[$o])) {
+ break;
+ }
+ $prefix_len++;
+ }
+
+ $suffix_len = 0;
+ for ($pos = 0; $pos < ($min_len - $prefix_len); $pos++) {
+ $o = $o_text[$o_len - ($pos + 1)];
+ $n = $n_text[$n_len - ($pos + 1)];
+ if ($o !== $n) {
+ break;
+ }
+ if (empty($layout_characters[$o])) {
+ break;
+ }
+ $suffix_len++;
+ }
+
+ $results = array();
+
+ if ($prefix_len) {
+ $results[] = array(
+ 'type' => '=',
+ 'text' => substr($o_text, 0, $prefix_len),
+ );
+ }
+
+ if ($prefix_len < $o_len) {
+ $results[] = array(
+ 'type' => '-',
+ 'text' => substr(
+ $o_text,
+ $prefix_len,
+ $o_len - $prefix_len - $suffix_len),
+ );
+ }
+
+ if ($prefix_len < $n_len) {
+ $results[] = array(
+ 'type' => '+',
+ 'text' => substr(
+ $n_text,
+ $prefix_len,
+ $n_len - $prefix_len - $suffix_len),
+ );
+ }
+
+ if ($suffix_len) {
+ $results[] = array(
+ 'type' => '=',
+ 'text' => substr($o_text, -$suffix_len),
+ );
+ }
+
+ return $results;
+ }
+
+ private function mergeParts(array $parts) {
+ $text = '';
+ $type = null;
+ foreach ($parts as $part) {
+ $part_type = $part['type'];
+ if ($type === null) {
+ $type = $part_type;
+ }
+ if ($type !== $part_type) {
+ throw new Exception(pht('Can not merge parts of dissimilar types!'));
+ }
+ $text .= $part['text'];
+ }
+
+ return array(
+ 'type' => $type,
+ 'text' => $text,
+ );
+ }
+
+ private function splitTextForSummary($text) {
+ $matches = null;
+
+ $ok = preg_match('/^(\n*[^\n]+)\n/', $text, $matches);
+ if (!$ok) {
+ return array($text);
+ }
+
+ $head = $matches[1];
+ $text = substr($text, strlen($head));
+
+ $ok = preg_match('/\n([^\n]+\n*)\z/', $text, $matches);
+ if (!$ok) {
+ return array($text);
+ }
+
+ $last = $matches[1];
+ $text = substr($text, 0, -strlen($last));
+
+ if (!strlen(trim($text))) {
+ return array($head, $last);
+ } else {
+ return array($head, $text, $last);
+ }
+ }
+
+}
diff --git a/src/infrastructure/diff/prose/PhutilProseDifferenceEngine.php b/src/infrastructure/diff/prose/PhutilProseDifferenceEngine.php
new file mode 100644
--- /dev/null
+++ b/src/infrastructure/diff/prose/PhutilProseDifferenceEngine.php
@@ -0,0 +1,209 @@
+<?php
+
+final class PhutilProseDifferenceEngine extends Phobject {
+
+ public function getDiff($u, $v) {
+ return $this->buildDiff($u, $v, 0);
+ }
+
+ private function buildDiff($u, $v, $level) {
+ $u_parts = $this->splitCorpus($u, $level);
+ $v_parts = $this->splitCorpus($v, $level);
+
+ $matrix = id(new PhutilEditDistanceMatrix())
+ ->setMaximumLength(128)
+ ->setSequences($u_parts, $v_parts)
+ ->setComputeString(true);
+
+ // For word-level and character-level changes, smooth the output string
+ // to reduce the choppiness of the diff.
+ if ($level > 1) {
+ $matrix->setApplySmoothing(PhutilEditDistanceMatrix::SMOOTHING_FULL);
+ }
+
+ $u_pos = 0;
+ $v_pos = 0;
+
+ $edits = $matrix->getEditString();
+ $edits_length = strlen($edits);
+
+ $diff = new PhutilProseDiff();
+ for ($ii = 0; $ii < $edits_length; $ii++) {
+ $c = $edits[$ii];
+ if ($c == 's') {
+ $diff->addPart('=', $u_parts[$u_pos]);
+ $u_pos++;
+ $v_pos++;
+ } else if ($c == 'd') {
+ $diff->addPart('-', $u_parts[$u_pos]);
+ $u_pos++;
+ } else if ($c == 'i') {
+ $diff->addPart('+', $v_parts[$v_pos]);
+ $v_pos++;
+ } else if ($c == 'x') {
+ $diff->addPart('-', $u_parts[$u_pos]);
+ $diff->addPart('+', $v_parts[$v_pos]);
+ $u_pos++;
+ $v_pos++;
+ } else {
+ throw new Exception(
+ pht(
+ 'Unexpected character ("%s") in edit string.',
+ $c));
+ }
+ }
+
+ $diff->reorderParts();
+
+ // If we just built a character-level diff, we're all done and do not
+ // need to go any deeper.
+ if ($level == 3) {
+ return $diff;
+ }
+
+ $blocks = array();
+ $block = null;
+ foreach ($diff->getParts() as $part) {
+ $type = $part['type'];
+ $text = $part['text'];
+ switch ($type) {
+ case '=':
+ if ($block) {
+ $blocks[] = $block;
+ $block = null;
+ }
+ $blocks[] = array(
+ 'type' => $type,
+ 'text' => $text,
+ );
+ break;
+ case '-':
+ if (!$block) {
+ $block = array(
+ 'type' => '!',
+ 'old' => '',
+ 'new' => '',
+ );
+ }
+ $block['old'] .= $text;
+ break;
+ case '+':
+ if (!$block) {
+ $block = array(
+ 'type' => '!',
+ 'old' => '',
+ 'new' => '',
+ );
+ }
+ $block['new'] .= $text;
+ break;
+ }
+ }
+
+ if ($block) {
+ $blocks[] = $block;
+ }
+
+ $result = new PhutilProseDiff();
+ foreach ($blocks as $block) {
+ $type = $block['type'];
+ if ($type == '=') {
+ $result->addPart('=', $block['text']);
+ } else {
+ $old = $block['old'];
+ $new = $block['new'];
+ if (!strlen($old) && !strlen($new)) {
+ // Nothing to do.
+ } else if (!strlen($old)) {
+ $result->addPart('+', $new);
+ } else if (!strlen($new)) {
+ $result->addPart('-', $old);
+ } else {
+ if ($matrix->didReachMaximumLength()) {
+ // If this text was too big to diff, don't try to subdivide it.
+ $result->addPart('-', $old);
+ $result->addPart('+', $new);
+ } else {
+ $subdiff = $this->buildDiff(
+ $old,
+ $new,
+ $level + 1);
+
+ foreach ($subdiff->getParts() as $part) {
+ $result->addPart($part['type'], $part['text']);
+ }
+ }
+ }
+ }
+ }
+
+ $result->reorderParts();
+
+ return $result;
+ }
+
+ private function splitCorpus($corpus, $level) {
+ switch ($level) {
+ case 0:
+ // Level 0: Split into paragraphs.
+ $expr = '/([\n]+)/';
+ break;
+ case 1:
+ // Level 1: Split into sentences.
+ $expr = '/([\n,!;?\.]+)/';
+ break;
+ case 2:
+ // Level 2: Split into words.
+ $expr = '/(\s+)/';
+ break;
+ case 3:
+ // Level 3: Split into characters.
+ return phutil_utf8v_combined($corpus);
+ }
+
+ $pieces = preg_split($expr, $corpus, -1, PREG_SPLIT_DELIM_CAPTURE);
+ return $this->stitchPieces($pieces, $level);
+ }
+
+ private function stitchPieces(array $pieces, $level) {
+ $results = array();
+ $count = count($pieces);
+ for ($ii = 0; $ii < $count; $ii += 2) {
+ $result = $pieces[$ii];
+ if ($ii + 1 < $count) {
+ $result .= $pieces[$ii + 1];
+ }
+
+ if ($level < 2) {
+ // Split pieces into separate text and whitespace sections: make one
+ // piece out of all the whitespace at the beginning, one piece out of
+ // all the actual text in the middle, and one piece out of all the
+ // whitespace at the end.
+
+ $matches = null;
+ preg_match('/^(\s*)(.*?)(\s*)\z/', $result, $matches);
+
+ if (strlen($matches[1])) {
+ $results[] = $matches[1];
+ }
+ if (strlen($matches[2])) {
+ $results[] = $matches[2];
+ }
+ if (strlen($matches[3])) {
+ $results[] = $matches[3];
+ }
+ } else {
+ $results[] = $result;
+ }
+ }
+
+ // If the input ended with a delimiter, we can get an empty final piece.
+ // Just discard it.
+ if (last($results) == '') {
+ array_pop($results);
+ }
+
+ return $results;
+ }
+
+}
diff --git a/src/infrastructure/diff/prose/__tests__/PhutilProseDiffTestCase.php b/src/infrastructure/diff/prose/__tests__/PhutilProseDiffTestCase.php
new file mode 100644
--- /dev/null
+++ b/src/infrastructure/diff/prose/__tests__/PhutilProseDiffTestCase.php
@@ -0,0 +1,245 @@
+<?php
+
+final class PhutilProseDiffTestCase extends PhutilTestCase {
+
+ public function testProseDiffsDistance() {
+ $this->assertProseParts(
+ '',
+ '',
+ array(),
+ pht('Empty'));
+
+ $this->assertProseParts(
+ "xxx\nyyy",
+ "xxx\nzzz\nyyy",
+ array(
+ "= xxx\n",
+ "+ zzz\n",
+ '= yyy',
+ ),
+ pht('Add Paragraph'));
+
+ $this->assertProseParts(
+ "xxx\nzzz\nyyy",
+ "xxx\nyyy",
+ array(
+ "= xxx\n",
+ "- zzz\n",
+ '= yyy',
+ ),
+ pht('Remove Paragraph'));
+
+
+ // Without smoothing, the alogorithm identifies that "shark" and "cat"
+ // both contain the letter "a" and tries to express this as a very
+ // fine-grained edit which replaces "sh" with "c" and then "rk" with "t".
+ // This is technically correct, but it is much easier for human viewers to
+ // parse if we smooth this into a single removal and a single addition.
+
+ $this->assertProseParts(
+ 'They say the shark has nine lives.',
+ 'They say the cat has nine lives.',
+ array(
+ '= They say the ',
+ '- shark',
+ '+ cat',
+ '= has nine lives.',
+ ),
+ pht('"Shark/cat" word edit smoothenss.'));
+
+ $this->assertProseParts(
+ 'Rising quickly, she says',
+ 'Rising quickly, she remarks:',
+ array(
+ '= Rising quickly, she ',
+ '- says',
+ '+ remarks:',
+ ),
+ pht('"Says/remarks" word edit smoothenss.'));
+
+ $this->assertProseParts(
+ 'See screenshots',
+ 'Viewed video files',
+ array(
+ '- See screenshots',
+ '+ Viewed video files',
+ ),
+ pht('Complete paragraph rewrite.'));
+
+ $this->assertProseParts(
+ 'xaaax',
+ 'xbbbx',
+ array(
+ '- xaaax',
+ '+ xbbbx',
+ ),
+ pht('Whole word rewrite with common prefix and suffix.'));
+
+ $this->assertProseParts(
+ ' aaa ',
+ ' bbb ',
+ array(
+ '= ',
+ '- aaa',
+ '+ bbb',
+ '= ',
+ ),
+ pht('Whole word rewrite with whitespace prefix and suffix.'));
+
+ $this->assertSummaryProseParts(
+ "a\nb\nc\nd\ne\nf\ng\nh\n",
+ "a\nb\nc\nd\nX\nf\ng\nh\n",
+ array(
+ '.',
+ "= d\n",
+ '- e',
+ '+ X',
+ "= \nf",
+ '.',
+ ),
+ pht('Summary diff with middle change.'));
+
+ $this->assertSummaryProseParts(
+ "a\nb\nc\nd\ne\nf\ng\nh\n",
+ "X\nb\nc\nd\ne\nf\ng\nh\n",
+ array(
+ '- a',
+ '+ X',
+ "= \nb",
+ '.',
+ ),
+ pht('Summary diff with head change.'));
+
+ $this->assertSummaryProseParts(
+ "a\nb\nc\nd\ne\nf\ng\nh\n",
+ "a\nb\nc\nd\ne\nf\ng\nX\n",
+ array(
+ '.',
+ "= g\n",
+ '- h',
+ '+ X',
+ "= \n",
+ ),
+ pht('Summary diff with last change.'));
+
+ $this->assertProseParts(
+ 'aaa aaa aaa aaa, bbb bbb bbb bbb.',
+ "aaa aaa aaa aaa, bbb bbb bbb bbb.\n\n- ccc ccc ccc",
+ array(
+ '= aaa aaa aaa aaa, bbb bbb bbb bbb.',
+ "+ \n\n- ccc ccc ccc",
+ ),
+ pht('Diff with new trailing content.'));
+
+ $this->assertProseParts(
+ 'aaa aaa aaa aaa, bbb bbb bbb bbb.',
+ 'aaa aaa aaa aaa bbb bbb bbb bbb.',
+ array(
+ '= aaa aaa aaa aaa',
+ '- ,',
+ '= bbb bbb bbb bbb.',
+ ),
+ pht('Diff with a removed comma.'));
+
+ $this->assertProseParts(
+ 'aaa aaa aaa aaa, bbb bbb bbb bbb.',
+ "aaa aaa aaa aaa bbb bbb bbb bbb.\n\n- ccc ccc ccc!",
+ array(
+ '= aaa aaa aaa aaa',
+ '- ,',
+ '= bbb bbb bbb bbb.',
+ "+ \n\n- ccc ccc ccc!",
+ ),
+ pht('Diff with a removed comma and new trailing content.'));
+
+ $this->assertProseParts(
+ '[ ] Walnuts',
+ '[X] Walnuts',
+ array(
+ '= [',
+ '- ',
+ '+ X',
+ '= ] Walnuts',
+ ),
+ pht('Diff adding a tickmark to a checkbox list.'));
+
+ $this->assertProseParts(
+ '[[ ./week49 ]]',
+ '[[ ./week50 ]]',
+ array(
+ '= [[ ./week',
+ '- 49',
+ '+ 50',
+ '= ]]',
+ ),
+ pht('Diff changing a remarkup wiki link target.'));
+
+ // Create a large corpus with many sentences and paragraphs.
+ $large_paragraph = 'xyz. ';
+ $large_paragraph = str_repeat($large_paragraph, 50);
+ $large_paragraph = rtrim($large_paragraph);
+
+ $large_corpus = $large_paragraph."\n\n";
+ $large_corpus = str_repeat($large_corpus, 50);
+ $large_corpus = rtrim($large_corpus);
+
+ $this->assertProseParts(
+ $large_corpus,
+ "aaa\n\n".$large_corpus."\n\nzzz",
+ array(
+ "+ aaa\n\n",
+ '= '.$large_corpus,
+ "+ \n\nzzz",
+ ),
+ pht('Adding initial and final lines to a large corpus.'));
+
+ }
+
+ private function assertProseParts($old, $new, array $expect_parts, $label) {
+ $engine = new PhutilProseDifferenceEngine();
+ $diff = $engine->getDiff($old, $new);
+
+ $parts = $diff->getParts();
+
+ $this->assertParts($expect_parts, $parts, $label);
+ }
+
+ private function assertSummaryProseParts(
+ $old,
+ $new,
+ array $expect_parts,
+ $label) {
+
+ $engine = new PhutilProseDifferenceEngine();
+ $diff = $engine->getDiff($old, $new);
+
+ $parts = $diff->getSummaryParts();
+
+ $this->assertParts($expect_parts, $parts, $label);
+ }
+
+ private function assertParts(
+ array $expect,
+ array $actual_parts,
+ $label) {
+
+ $actual = array();
+ foreach ($actual_parts as $actual_part) {
+ $type = $actual_part['type'];
+ $text = $actual_part['text'];
+
+ switch ($type) {
+ case '.':
+ $actual[] = $type;
+ break;
+ default:
+ $actual[] = "{$type} {$text}";
+ break;
+ }
+ }
+
+ $this->assertEqual($expect, $actual, $label);
+ }
+
+
+}

File Metadata

Mime Type
text/plain
Expires
Sun, Mar 23, 8:41 AM (1 d, 9 h ago)
Storage Engine
blob
Storage Format
Encrypted (AES-256-CBC)
Storage Handle
7718229
Default Alt Text
D20838.diff (20 KB)

Event Timeline