Page Menu
Home
Phabricator
Search
Configure Global Search
Log In
Files
F15422483
D20838.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
20 KB
Referenced Files
None
Subscribers
None
D20838.diff
View Options
diff --git a/src/__phutil_library_map__.php b/src/__phutil_library_map__.php
--- a/src/__phutil_library_map__.php
+++ b/src/__phutil_library_map__.php
@@ -5599,6 +5599,9 @@
'PhutilOAuthAuthAdapter' => 'applications/auth/adapter/PhutilOAuthAuthAdapter.php',
'PhutilPHPCodeSnippetContextFreeGrammar' => 'infrastructure/lipsum/code/PhutilPHPCodeSnippetContextFreeGrammar.php',
'PhutilPhabricatorAuthAdapter' => 'applications/auth/adapter/PhutilPhabricatorAuthAdapter.php',
+ 'PhutilProseDiff' => 'infrastructure/diff/prose/PhutilProseDiff.php',
+ 'PhutilProseDiffTestCase' => 'infrastructure/diff/prose/__tests__/PhutilProseDiffTestCase.php',
+ 'PhutilProseDifferenceEngine' => 'infrastructure/diff/prose/PhutilProseDifferenceEngine.php',
'PhutilQsprintfInterface' => 'infrastructure/storage/xsprintf/PhutilQsprintfInterface.php',
'PhutilQueryString' => 'infrastructure/storage/xsprintf/PhutilQueryString.php',
'PhutilRealNameContextFreeGrammar' => 'infrastructure/lipsum/PhutilRealNameContextFreeGrammar.php',
@@ -12398,6 +12401,9 @@
'PhutilOAuthAuthAdapter' => 'PhutilAuthAdapter',
'PhutilPHPCodeSnippetContextFreeGrammar' => 'PhutilCLikeCodeSnippetContextFreeGrammar',
'PhutilPhabricatorAuthAdapter' => 'PhutilOAuthAuthAdapter',
+ 'PhutilProseDiff' => 'Phobject',
+ 'PhutilProseDiffTestCase' => 'PhutilTestCase',
+ 'PhutilProseDifferenceEngine' => 'Phobject',
'PhutilQueryString' => 'Phobject',
'PhutilRealNameContextFreeGrammar' => 'PhutilContextFreeGrammar',
'PhutilRemarkupAnchorRule' => 'PhutilRemarkupRule',
diff --git a/src/infrastructure/diff/prose/PhutilProseDiff.php b/src/infrastructure/diff/prose/PhutilProseDiff.php
new file mode 100644
--- /dev/null
+++ b/src/infrastructure/diff/prose/PhutilProseDiff.php
@@ -0,0 +1,292 @@
+<?php
+
+final class PhutilProseDiff extends Phobject {
+
+ private $parts = array();
+
+ public function addPart($type, $text) {
+ $this->parts[] = array(
+ 'type' => $type,
+ 'text' => $text,
+ );
+ return $this;
+ }
+
+ public function getParts() {
+ return $this->parts;
+ }
+
+ /**
+ * Get diff parts, but replace large blocks of unchanged text with "."
+ * parts representing missing context.
+ */
+ public function getSummaryParts() {
+ $parts = $this->getParts();
+
+ $head_key = head_key($parts);
+ $last_key = last_key($parts);
+
+ $results = array();
+ foreach ($parts as $key => $part) {
+ $is_head = ($key == $head_key);
+ $is_last = ($key == $last_key);
+
+ switch ($part['type']) {
+ case '=':
+ $pieces = $this->splitTextForSummary($part['text']);
+
+ if ($is_head || $is_last) {
+ $need = 2;
+ } else {
+ $need = 3;
+ }
+
+ // We don't have enough pieces to omit anything, so just continue.
+ if (count($pieces) < $need) {
+ $results[] = $part;
+ break;
+ }
+
+ if (!$is_head) {
+ $results[] = array(
+ 'type' => '=',
+ 'text' => head($pieces),
+ );
+ }
+
+ $results[] = array(
+ 'type' => '.',
+ 'text' => null,
+ );
+
+ if (!$is_last) {
+ $results[] = array(
+ 'type' => '=',
+ 'text' => last($pieces),
+ );
+ }
+ break;
+ default:
+ $results[] = $part;
+ break;
+ }
+ }
+
+ return $results;
+ }
+
+
+ public function reorderParts() {
+ // Reorder sequences of removed and added sections to put all the "-"
+ // parts together first, then all the "+" parts together. This produces
+ // a more human-readable result than intermingling them.
+
+ $o_run = array();
+ $n_run = array();
+ $result = array();
+ foreach ($this->parts as $part) {
+ $type = $part['type'];
+ switch ($type) {
+ case '-':
+ $o_run[] = $part;
+ break;
+ case '+':
+ $n_run[] = $part;
+ break;
+ default:
+ if ($o_run || $n_run) {
+ foreach ($this->combineRuns($o_run, $n_run) as $merged_part) {
+ $result[] = $merged_part;
+ }
+ $o_run = array();
+ $n_run = array();
+ }
+ $result[] = $part;
+ break;
+ }
+ }
+
+ if ($o_run || $n_run) {
+ foreach ($this->combineRuns($o_run, $n_run) as $part) {
+ $result[] = $part;
+ }
+ }
+
+ // Now, combine consecuitive runs of the same type of change (like a
+ // series of "-" parts) into a single run.
+ $combined = array();
+
+ $last = null;
+ $last_text = null;
+ foreach ($result as $part) {
+ $type = $part['type'];
+
+ if ($last !== $type) {
+ if ($last !== null) {
+ $combined[] = array(
+ 'type' => $last,
+ 'text' => $last_text,
+ );
+ }
+ $last_text = null;
+ $last = $type;
+ }
+
+ $last_text .= $part['text'];
+ }
+
+ if ($last_text !== null) {
+ $combined[] = array(
+ 'type' => $last,
+ 'text' => $last_text,
+ );
+ }
+
+ $this->parts = $combined;
+
+ return $this;
+ }
+
+ private function combineRuns($o_run, $n_run) {
+ $o_merge = $this->mergeParts($o_run);
+ $n_merge = $this->mergeParts($n_run);
+
+ // When removed and added blocks share a prefix or suffix, we sometimes
+ // want to count it as unchanged (for example, if it is whitespace) but
+ // sometimes want to count it as changed (for example, if it is a word
+ // suffix like "ing"). Find common prefixes and suffixes of these layout
+ // characters and emit them as "=" (unchanged) blocks.
+
+ $layout_characters = array(
+ ' ' => true,
+ "\n" => true,
+ '.' => true,
+ '!' => true,
+ ',' => true,
+ '?' => true,
+ ']' => true,
+ '[' => true,
+ '(' => true,
+ ')' => true,
+ '<' => true,
+ '>' => true,
+ );
+
+ $o_text = $o_merge['text'];
+ $n_text = $n_merge['text'];
+ $o_len = strlen($o_text);
+ $n_len = strlen($n_text);
+ $min_len = min($o_len, $n_len);
+
+ $prefix_len = 0;
+ for ($pos = 0; $pos < $min_len; $pos++) {
+ $o = $o_text[$pos];
+ $n = $n_text[$pos];
+ if ($o !== $n) {
+ break;
+ }
+ if (empty($layout_characters[$o])) {
+ break;
+ }
+ $prefix_len++;
+ }
+
+ $suffix_len = 0;
+ for ($pos = 0; $pos < ($min_len - $prefix_len); $pos++) {
+ $o = $o_text[$o_len - ($pos + 1)];
+ $n = $n_text[$n_len - ($pos + 1)];
+ if ($o !== $n) {
+ break;
+ }
+ if (empty($layout_characters[$o])) {
+ break;
+ }
+ $suffix_len++;
+ }
+
+ $results = array();
+
+ if ($prefix_len) {
+ $results[] = array(
+ 'type' => '=',
+ 'text' => substr($o_text, 0, $prefix_len),
+ );
+ }
+
+ if ($prefix_len < $o_len) {
+ $results[] = array(
+ 'type' => '-',
+ 'text' => substr(
+ $o_text,
+ $prefix_len,
+ $o_len - $prefix_len - $suffix_len),
+ );
+ }
+
+ if ($prefix_len < $n_len) {
+ $results[] = array(
+ 'type' => '+',
+ 'text' => substr(
+ $n_text,
+ $prefix_len,
+ $n_len - $prefix_len - $suffix_len),
+ );
+ }
+
+ if ($suffix_len) {
+ $results[] = array(
+ 'type' => '=',
+ 'text' => substr($o_text, -$suffix_len),
+ );
+ }
+
+ return $results;
+ }
+
+ private function mergeParts(array $parts) {
+ $text = '';
+ $type = null;
+ foreach ($parts as $part) {
+ $part_type = $part['type'];
+ if ($type === null) {
+ $type = $part_type;
+ }
+ if ($type !== $part_type) {
+ throw new Exception(pht('Can not merge parts of dissimilar types!'));
+ }
+ $text .= $part['text'];
+ }
+
+ return array(
+ 'type' => $type,
+ 'text' => $text,
+ );
+ }
+
+ private function splitTextForSummary($text) {
+ $matches = null;
+
+ $ok = preg_match('/^(\n*[^\n]+)\n/', $text, $matches);
+ if (!$ok) {
+ return array($text);
+ }
+
+ $head = $matches[1];
+ $text = substr($text, strlen($head));
+
+ $ok = preg_match('/\n([^\n]+\n*)\z/', $text, $matches);
+ if (!$ok) {
+ return array($text);
+ }
+
+ $last = $matches[1];
+ $text = substr($text, 0, -strlen($last));
+
+ if (!strlen(trim($text))) {
+ return array($head, $last);
+ } else {
+ return array($head, $text, $last);
+ }
+ }
+
+}
diff --git a/src/infrastructure/diff/prose/PhutilProseDifferenceEngine.php b/src/infrastructure/diff/prose/PhutilProseDifferenceEngine.php
new file mode 100644
--- /dev/null
+++ b/src/infrastructure/diff/prose/PhutilProseDifferenceEngine.php
@@ -0,0 +1,209 @@
+<?php
+
+final class PhutilProseDifferenceEngine extends Phobject {
+
+ public function getDiff($u, $v) {
+ return $this->buildDiff($u, $v, 0);
+ }
+
+ private function buildDiff($u, $v, $level) {
+ $u_parts = $this->splitCorpus($u, $level);
+ $v_parts = $this->splitCorpus($v, $level);
+
+ $matrix = id(new PhutilEditDistanceMatrix())
+ ->setMaximumLength(128)
+ ->setSequences($u_parts, $v_parts)
+ ->setComputeString(true);
+
+ // For word-level and character-level changes, smooth the output string
+ // to reduce the choppiness of the diff.
+ if ($level > 1) {
+ $matrix->setApplySmoothing(PhutilEditDistanceMatrix::SMOOTHING_FULL);
+ }
+
+ $u_pos = 0;
+ $v_pos = 0;
+
+ $edits = $matrix->getEditString();
+ $edits_length = strlen($edits);
+
+ $diff = new PhutilProseDiff();
+ for ($ii = 0; $ii < $edits_length; $ii++) {
+ $c = $edits[$ii];
+ if ($c == 's') {
+ $diff->addPart('=', $u_parts[$u_pos]);
+ $u_pos++;
+ $v_pos++;
+ } else if ($c == 'd') {
+ $diff->addPart('-', $u_parts[$u_pos]);
+ $u_pos++;
+ } else if ($c == 'i') {
+ $diff->addPart('+', $v_parts[$v_pos]);
+ $v_pos++;
+ } else if ($c == 'x') {
+ $diff->addPart('-', $u_parts[$u_pos]);
+ $diff->addPart('+', $v_parts[$v_pos]);
+ $u_pos++;
+ $v_pos++;
+ } else {
+ throw new Exception(
+ pht(
+ 'Unexpected character ("%s") in edit string.',
+ $c));
+ }
+ }
+
+ $diff->reorderParts();
+
+ // If we just built a character-level diff, we're all done and do not
+ // need to go any deeper.
+ if ($level == 3) {
+ return $diff;
+ }
+
+ $blocks = array();
+ $block = null;
+ foreach ($diff->getParts() as $part) {
+ $type = $part['type'];
+ $text = $part['text'];
+ switch ($type) {
+ case '=':
+ if ($block) {
+ $blocks[] = $block;
+ $block = null;
+ }
+ $blocks[] = array(
+ 'type' => $type,
+ 'text' => $text,
+ );
+ break;
+ case '-':
+ if (!$block) {
+ $block = array(
+ 'type' => '!',
+ 'old' => '',
+ 'new' => '',
+ );
+ }
+ $block['old'] .= $text;
+ break;
+ case '+':
+ if (!$block) {
+ $block = array(
+ 'type' => '!',
+ 'old' => '',
+ 'new' => '',
+ );
+ }
+ $block['new'] .= $text;
+ break;
+ }
+ }
+
+ if ($block) {
+ $blocks[] = $block;
+ }
+
+ $result = new PhutilProseDiff();
+ foreach ($blocks as $block) {
+ $type = $block['type'];
+ if ($type == '=') {
+ $result->addPart('=', $block['text']);
+ } else {
+ $old = $block['old'];
+ $new = $block['new'];
+ if (!strlen($old) && !strlen($new)) {
+ // Nothing to do.
+ } else if (!strlen($old)) {
+ $result->addPart('+', $new);
+ } else if (!strlen($new)) {
+ $result->addPart('-', $old);
+ } else {
+ if ($matrix->didReachMaximumLength()) {
+ // If this text was too big to diff, don't try to subdivide it.
+ $result->addPart('-', $old);
+ $result->addPart('+', $new);
+ } else {
+ $subdiff = $this->buildDiff(
+ $old,
+ $new,
+ $level + 1);
+
+ foreach ($subdiff->getParts() as $part) {
+ $result->addPart($part['type'], $part['text']);
+ }
+ }
+ }
+ }
+ }
+
+ $result->reorderParts();
+
+ return $result;
+ }
+
+ private function splitCorpus($corpus, $level) {
+ switch ($level) {
+ case 0:
+ // Level 0: Split into paragraphs.
+ $expr = '/([\n]+)/';
+ break;
+ case 1:
+ // Level 1: Split into sentences.
+ $expr = '/([\n,!;?\.]+)/';
+ break;
+ case 2:
+ // Level 2: Split into words.
+ $expr = '/(\s+)/';
+ break;
+ case 3:
+ // Level 3: Split into characters.
+ return phutil_utf8v_combined($corpus);
+ }
+
+ $pieces = preg_split($expr, $corpus, -1, PREG_SPLIT_DELIM_CAPTURE);
+ return $this->stitchPieces($pieces, $level);
+ }
+
+ private function stitchPieces(array $pieces, $level) {
+ $results = array();
+ $count = count($pieces);
+ for ($ii = 0; $ii < $count; $ii += 2) {
+ $result = $pieces[$ii];
+ if ($ii + 1 < $count) {
+ $result .= $pieces[$ii + 1];
+ }
+
+ if ($level < 2) {
+ // Split pieces into separate text and whitespace sections: make one
+ // piece out of all the whitespace at the beginning, one piece out of
+ // all the actual text in the middle, and one piece out of all the
+ // whitespace at the end.
+
+ $matches = null;
+ preg_match('/^(\s*)(.*?)(\s*)\z/', $result, $matches);
+
+ if (strlen($matches[1])) {
+ $results[] = $matches[1];
+ }
+ if (strlen($matches[2])) {
+ $results[] = $matches[2];
+ }
+ if (strlen($matches[3])) {
+ $results[] = $matches[3];
+ }
+ } else {
+ $results[] = $result;
+ }
+ }
+
+ // If the input ended with a delimiter, we can get an empty final piece.
+ // Just discard it.
+ if (last($results) == '') {
+ array_pop($results);
+ }
+
+ return $results;
+ }
+
+}
diff --git a/src/infrastructure/diff/prose/__tests__/PhutilProseDiffTestCase.php b/src/infrastructure/diff/prose/__tests__/PhutilProseDiffTestCase.php
new file mode 100644
--- /dev/null
+++ b/src/infrastructure/diff/prose/__tests__/PhutilProseDiffTestCase.php
@@ -0,0 +1,245 @@
+<?php
+
+final class PhutilProseDiffTestCase extends PhutilTestCase {
+
+ public function testProseDiffsDistance() {
+ $this->assertProseParts(
+ '',
+ '',
+ array(),
+ pht('Empty'));
+
+ $this->assertProseParts(
+ "xxx\nyyy",
+ "xxx\nzzz\nyyy",
+ array(
+ "= xxx\n",
+ "+ zzz\n",
+ '= yyy',
+ ),
+ pht('Add Paragraph'));
+
+ $this->assertProseParts(
+ "xxx\nzzz\nyyy",
+ "xxx\nyyy",
+ array(
+ "= xxx\n",
+ "- zzz\n",
+ '= yyy',
+ ),
+ pht('Remove Paragraph'));
+
+
+ // Without smoothing, the alogorithm identifies that "shark" and "cat"
+ // both contain the letter "a" and tries to express this as a very
+ // fine-grained edit which replaces "sh" with "c" and then "rk" with "t".
+ // This is technically correct, but it is much easier for human viewers to
+ // parse if we smooth this into a single removal and a single addition.
+
+ $this->assertProseParts(
+ 'They say the shark has nine lives.',
+ 'They say the cat has nine lives.',
+ array(
+ '= They say the ',
+ '- shark',
+ '+ cat',
+ '= has nine lives.',
+ ),
+ pht('"Shark/cat" word edit smoothenss.'));
+
+ $this->assertProseParts(
+ 'Rising quickly, she says',
+ 'Rising quickly, she remarks:',
+ array(
+ '= Rising quickly, she ',
+ '- says',
+ '+ remarks:',
+ ),
+ pht('"Says/remarks" word edit smoothenss.'));
+
+ $this->assertProseParts(
+ 'See screenshots',
+ 'Viewed video files',
+ array(
+ '- See screenshots',
+ '+ Viewed video files',
+ ),
+ pht('Complete paragraph rewrite.'));
+
+ $this->assertProseParts(
+ 'xaaax',
+ 'xbbbx',
+ array(
+ '- xaaax',
+ '+ xbbbx',
+ ),
+ pht('Whole word rewrite with common prefix and suffix.'));
+
+ $this->assertProseParts(
+ ' aaa ',
+ ' bbb ',
+ array(
+ '= ',
+ '- aaa',
+ '+ bbb',
+ '= ',
+ ),
+ pht('Whole word rewrite with whitespace prefix and suffix.'));
+
+ $this->assertSummaryProseParts(
+ "a\nb\nc\nd\ne\nf\ng\nh\n",
+ "a\nb\nc\nd\nX\nf\ng\nh\n",
+ array(
+ '.',
+ "= d\n",
+ '- e',
+ '+ X',
+ "= \nf",
+ '.',
+ ),
+ pht('Summary diff with middle change.'));
+
+ $this->assertSummaryProseParts(
+ "a\nb\nc\nd\ne\nf\ng\nh\n",
+ "X\nb\nc\nd\ne\nf\ng\nh\n",
+ array(
+ '- a',
+ '+ X',
+ "= \nb",
+ '.',
+ ),
+ pht('Summary diff with head change.'));
+
+ $this->assertSummaryProseParts(
+ "a\nb\nc\nd\ne\nf\ng\nh\n",
+ "a\nb\nc\nd\ne\nf\ng\nX\n",
+ array(
+ '.',
+ "= g\n",
+ '- h',
+ '+ X',
+ "= \n",
+ ),
+ pht('Summary diff with last change.'));
+
+ $this->assertProseParts(
+ 'aaa aaa aaa aaa, bbb bbb bbb bbb.',
+ "aaa aaa aaa aaa, bbb bbb bbb bbb.\n\n- ccc ccc ccc",
+ array(
+ '= aaa aaa aaa aaa, bbb bbb bbb bbb.',
+ "+ \n\n- ccc ccc ccc",
+ ),
+ pht('Diff with new trailing content.'));
+
+ $this->assertProseParts(
+ 'aaa aaa aaa aaa, bbb bbb bbb bbb.',
+ 'aaa aaa aaa aaa bbb bbb bbb bbb.',
+ array(
+ '= aaa aaa aaa aaa',
+ '- ,',
+ '= bbb bbb bbb bbb.',
+ ),
+ pht('Diff with a removed comma.'));
+
+ $this->assertProseParts(
+ 'aaa aaa aaa aaa, bbb bbb bbb bbb.',
+ "aaa aaa aaa aaa bbb bbb bbb bbb.\n\n- ccc ccc ccc!",
+ array(
+ '= aaa aaa aaa aaa',
+ '- ,',
+ '= bbb bbb bbb bbb.',
+ "+ \n\n- ccc ccc ccc!",
+ ),
+ pht('Diff with a removed comma and new trailing content.'));
+
+ $this->assertProseParts(
+ '[ ] Walnuts',
+ '[X] Walnuts',
+ array(
+ '= [',
+ '- ',
+ '+ X',
+ '= ] Walnuts',
+ ),
+ pht('Diff adding a tickmark to a checkbox list.'));
+
+ $this->assertProseParts(
+ '[[ ./week49 ]]',
+ '[[ ./week50 ]]',
+ array(
+ '= [[ ./week',
+ '- 49',
+ '+ 50',
+ '= ]]',
+ ),
+ pht('Diff changing a remarkup wiki link target.'));
+
+ // Create a large corpus with many sentences and paragraphs.
+ $large_paragraph = 'xyz. ';
+ $large_paragraph = str_repeat($large_paragraph, 50);
+ $large_paragraph = rtrim($large_paragraph);
+
+ $large_corpus = $large_paragraph."\n\n";
+ $large_corpus = str_repeat($large_corpus, 50);
+ $large_corpus = rtrim($large_corpus);
+
+ $this->assertProseParts(
+ $large_corpus,
+ "aaa\n\n".$large_corpus."\n\nzzz",
+ array(
+ "+ aaa\n\n",
+ '= '.$large_corpus,
+ "+ \n\nzzz",
+ ),
+ pht('Adding initial and final lines to a large corpus.'));
+
+ }
+
+ private function assertProseParts($old, $new, array $expect_parts, $label) {
+ $engine = new PhutilProseDifferenceEngine();
+ $diff = $engine->getDiff($old, $new);
+
+ $parts = $diff->getParts();
+
+ $this->assertParts($expect_parts, $parts, $label);
+ }
+
+ private function assertSummaryProseParts(
+ $old,
+ $new,
+ array $expect_parts,
+ $label) {
+
+ $engine = new PhutilProseDifferenceEngine();
+ $diff = $engine->getDiff($old, $new);
+
+ $parts = $diff->getSummaryParts();
+
+ $this->assertParts($expect_parts, $parts, $label);
+ }
+
+ private function assertParts(
+ array $expect,
+ array $actual_parts,
+ $label) {
+
+ $actual = array();
+ foreach ($actual_parts as $actual_part) {
+ $type = $actual_part['type'];
+ $text = $actual_part['text'];
+
+ switch ($type) {
+ case '.':
+ $actual[] = $type;
+ break;
+ default:
+ $actual[] = "{$type} {$text}";
+ break;
+ }
+ }
+
+ $this->assertEqual($expect, $actual, $label);
+ }
+
+
+}
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Sun, Mar 23, 8:41 AM (1 d, 9 h ago)
Storage Engine
blob
Storage Format
Encrypted (AES-256-CBC)
Storage Handle
7718229
Default Alt Text
D20838.diff (20 KB)
Attached To
Mode
D20838: Add "PhutilProseDiff" classes to "phabricator/"
Attached
Detach File
Event Timeline
Log In to Comment