diff --git a/src/__phutil_library_map__.php b/src/__phutil_library_map__.php --- a/src/__phutil_library_map__.php +++ b/src/__phutil_library_map__.php @@ -5597,6 +5597,9 @@ 'PhutilOAuthAuthAdapter' => 'applications/auth/adapter/PhutilOAuthAuthAdapter.php', 'PhutilPHPCodeSnippetContextFreeGrammar' => 'infrastructure/lipsum/code/PhutilPHPCodeSnippetContextFreeGrammar.php', 'PhutilPhabricatorAuthAdapter' => 'applications/auth/adapter/PhutilPhabricatorAuthAdapter.php', + 'PhutilProseDiff' => 'infrastructure/diff/prose/PhutilProseDiff.php', + 'PhutilProseDiffTestCase' => 'infrastructure/diff/prose/__tests__/PhutilProseDiffTestCase.php', + 'PhutilProseDifferenceEngine' => 'infrastructure/diff/prose/PhutilProseDifferenceEngine.php', 'PhutilQsprintfInterface' => 'infrastructure/storage/xsprintf/PhutilQsprintfInterface.php', 'PhutilQueryString' => 'infrastructure/storage/xsprintf/PhutilQueryString.php', 'PhutilRealNameContextFreeGrammar' => 'infrastructure/lipsum/PhutilRealNameContextFreeGrammar.php', @@ -12393,6 +12396,9 @@ 'PhutilOAuthAuthAdapter' => 'PhutilAuthAdapter', 'PhutilPHPCodeSnippetContextFreeGrammar' => 'PhutilCLikeCodeSnippetContextFreeGrammar', 'PhutilPhabricatorAuthAdapter' => 'PhutilOAuthAuthAdapter', + 'PhutilProseDiff' => 'Phobject', + 'PhutilProseDiffTestCase' => 'PhutilTestCase', + 'PhutilProseDifferenceEngine' => 'Phobject', 'PhutilQueryString' => 'Phobject', 'PhutilRealNameContextFreeGrammar' => 'PhutilContextFreeGrammar', 'PhutilRemarkupBlockInterpreter' => 'Phobject', diff --git a/src/infrastructure/diff/prose/PhutilProseDiff.php b/src/infrastructure/diff/prose/PhutilProseDiff.php new file mode 100644 --- /dev/null +++ b/src/infrastructure/diff/prose/PhutilProseDiff.php @@ -0,0 +1,292 @@ +parts[] = array( + 'type' => $type, + 'text' => $text, + ); + return $this; + } + + public function getParts() { + return $this->parts; + } + + /** + * Get diff parts, but replace large blocks of unchanged text with "." + * parts representing missing context. + */ + public function getSummaryParts() { + $parts = $this->getParts(); + + $head_key = head_key($parts); + $last_key = last_key($parts); + + $results = array(); + foreach ($parts as $key => $part) { + $is_head = ($key == $head_key); + $is_last = ($key == $last_key); + + switch ($part['type']) { + case '=': + $pieces = $this->splitTextForSummary($part['text']); + + if ($is_head || $is_last) { + $need = 2; + } else { + $need = 3; + } + + // We don't have enough pieces to omit anything, so just continue. + if (count($pieces) < $need) { + $results[] = $part; + break; + } + + if (!$is_head) { + $results[] = array( + 'type' => '=', + 'text' => head($pieces), + ); + } + + $results[] = array( + 'type' => '.', + 'text' => null, + ); + + if (!$is_last) { + $results[] = array( + 'type' => '=', + 'text' => last($pieces), + ); + } + break; + default: + $results[] = $part; + break; + } + } + + return $results; + } + + + public function reorderParts() { + // Reorder sequences of removed and added sections to put all the "-" + // parts together first, then all the "+" parts together. This produces + // a more human-readable result than intermingling them. + + $o_run = array(); + $n_run = array(); + $result = array(); + foreach ($this->parts as $part) { + $type = $part['type']; + switch ($type) { + case '-': + $o_run[] = $part; + break; + case '+': + $n_run[] = $part; + break; + default: + if ($o_run || $n_run) { + foreach ($this->combineRuns($o_run, $n_run) as $merged_part) { + $result[] = $merged_part; + } + $o_run = array(); + $n_run = array(); + } + $result[] = $part; + break; + } + } + + if ($o_run || $n_run) { + foreach ($this->combineRuns($o_run, $n_run) as $part) { + $result[] = $part; + } + } + + // Now, combine consecuitive runs of the same type of change (like a + // series of "-" parts) into a single run. + $combined = array(); + + $last = null; + $last_text = null; + foreach ($result as $part) { + $type = $part['type']; + + if ($last !== $type) { + if ($last !== null) { + $combined[] = array( + 'type' => $last, + 'text' => $last_text, + ); + } + $last_text = null; + $last = $type; + } + + $last_text .= $part['text']; + } + + if ($last_text !== null) { + $combined[] = array( + 'type' => $last, + 'text' => $last_text, + ); + } + + $this->parts = $combined; + + return $this; + } + + private function combineRuns($o_run, $n_run) { + $o_merge = $this->mergeParts($o_run); + $n_merge = $this->mergeParts($n_run); + + // When removed and added blocks share a prefix or suffix, we sometimes + // want to count it as unchanged (for example, if it is whitespace) but + // sometimes want to count it as changed (for example, if it is a word + // suffix like "ing"). Find common prefixes and suffixes of these layout + // characters and emit them as "=" (unchanged) blocks. + + $layout_characters = array( + ' ' => true, + "\n" => true, + '.' => true, + '!' => true, + ',' => true, + '?' => true, + ']' => true, + '[' => true, + '(' => true, + ')' => true, + '<' => true, + '>' => true, + ); + + $o_text = $o_merge['text']; + $n_text = $n_merge['text']; + $o_len = strlen($o_text); + $n_len = strlen($n_text); + $min_len = min($o_len, $n_len); + + $prefix_len = 0; + for ($pos = 0; $pos < $min_len; $pos++) { + $o = $o_text[$pos]; + $n = $n_text[$pos]; + if ($o !== $n) { + break; + } + if (empty($layout_characters[$o])) { + break; + } + $prefix_len++; + } + + $suffix_len = 0; + for ($pos = 0; $pos < ($min_len - $prefix_len); $pos++) { + $o = $o_text[$o_len - ($pos + 1)]; + $n = $n_text[$n_len - ($pos + 1)]; + if ($o !== $n) { + break; + } + if (empty($layout_characters[$o])) { + break; + } + $suffix_len++; + } + + $results = array(); + + if ($prefix_len) { + $results[] = array( + 'type' => '=', + 'text' => substr($o_text, 0, $prefix_len), + ); + } + + if ($prefix_len < $o_len) { + $results[] = array( + 'type' => '-', + 'text' => substr( + $o_text, + $prefix_len, + $o_len - $prefix_len - $suffix_len), + ); + } + + if ($prefix_len < $n_len) { + $results[] = array( + 'type' => '+', + 'text' => substr( + $n_text, + $prefix_len, + $n_len - $prefix_len - $suffix_len), + ); + } + + if ($suffix_len) { + $results[] = array( + 'type' => '=', + 'text' => substr($o_text, -$suffix_len), + ); + } + + return $results; + } + + private function mergeParts(array $parts) { + $text = ''; + $type = null; + foreach ($parts as $part) { + $part_type = $part['type']; + if ($type === null) { + $type = $part_type; + } + if ($type !== $part_type) { + throw new Exception(pht('Can not merge parts of dissimilar types!')); + } + $text .= $part['text']; + } + + return array( + 'type' => $type, + 'text' => $text, + ); + } + + private function splitTextForSummary($text) { + $matches = null; + + $ok = preg_match('/^(\n*[^\n]+)\n/', $text, $matches); + if (!$ok) { + return array($text); + } + + $head = $matches[1]; + $text = substr($text, strlen($head)); + + $ok = preg_match('/\n([^\n]+\n*)\z/', $text, $matches); + if (!$ok) { + return array($text); + } + + $last = $matches[1]; + $text = substr($text, 0, -strlen($last)); + + if (!strlen(trim($text))) { + return array($head, $last); + } else { + return array($head, $text, $last); + } + } + +} diff --git a/src/infrastructure/diff/prose/PhutilProseDifferenceEngine.php b/src/infrastructure/diff/prose/PhutilProseDifferenceEngine.php new file mode 100644 --- /dev/null +++ b/src/infrastructure/diff/prose/PhutilProseDifferenceEngine.php @@ -0,0 +1,209 @@ +buildDiff($u, $v, 0); + } + + private function buildDiff($u, $v, $level) { + $u_parts = $this->splitCorpus($u, $level); + $v_parts = $this->splitCorpus($v, $level); + + $matrix = id(new PhutilEditDistanceMatrix()) + ->setMaximumLength(128) + ->setSequences($u_parts, $v_parts) + ->setComputeString(true); + + // For word-level and character-level changes, smooth the output string + // to reduce the choppiness of the diff. + if ($level > 1) { + $matrix->setApplySmoothing(PhutilEditDistanceMatrix::SMOOTHING_FULL); + } + + $u_pos = 0; + $v_pos = 0; + + $edits = $matrix->getEditString(); + $edits_length = strlen($edits); + + $diff = new PhutilProseDiff(); + for ($ii = 0; $ii < $edits_length; $ii++) { + $c = $edits[$ii]; + if ($c == 's') { + $diff->addPart('=', $u_parts[$u_pos]); + $u_pos++; + $v_pos++; + } else if ($c == 'd') { + $diff->addPart('-', $u_parts[$u_pos]); + $u_pos++; + } else if ($c == 'i') { + $diff->addPart('+', $v_parts[$v_pos]); + $v_pos++; + } else if ($c == 'x') { + $diff->addPart('-', $u_parts[$u_pos]); + $diff->addPart('+', $v_parts[$v_pos]); + $u_pos++; + $v_pos++; + } else { + throw new Exception( + pht( + 'Unexpected character ("%s") in edit string.', + $c)); + } + } + + $diff->reorderParts(); + + // If we just built a character-level diff, we're all done and do not + // need to go any deeper. + if ($level == 3) { + return $diff; + } + + $blocks = array(); + $block = null; + foreach ($diff->getParts() as $part) { + $type = $part['type']; + $text = $part['text']; + switch ($type) { + case '=': + if ($block) { + $blocks[] = $block; + $block = null; + } + $blocks[] = array( + 'type' => $type, + 'text' => $text, + ); + break; + case '-': + if (!$block) { + $block = array( + 'type' => '!', + 'old' => '', + 'new' => '', + ); + } + $block['old'] .= $text; + break; + case '+': + if (!$block) { + $block = array( + 'type' => '!', + 'old' => '', + 'new' => '', + ); + } + $block['new'] .= $text; + break; + } + } + + if ($block) { + $blocks[] = $block; + } + + $result = new PhutilProseDiff(); + foreach ($blocks as $block) { + $type = $block['type']; + if ($type == '=') { + $result->addPart('=', $block['text']); + } else { + $old = $block['old']; + $new = $block['new']; + if (!strlen($old) && !strlen($new)) { + // Nothing to do. + } else if (!strlen($old)) { + $result->addPart('+', $new); + } else if (!strlen($new)) { + $result->addPart('-', $old); + } else { + if ($matrix->didReachMaximumLength()) { + // If this text was too big to diff, don't try to subdivide it. + $result->addPart('-', $old); + $result->addPart('+', $new); + } else { + $subdiff = $this->buildDiff( + $old, + $new, + $level + 1); + + foreach ($subdiff->getParts() as $part) { + $result->addPart($part['type'], $part['text']); + } + } + } + } + } + + $result->reorderParts(); + + return $result; + } + + private function splitCorpus($corpus, $level) { + switch ($level) { + case 0: + // Level 0: Split into paragraphs. + $expr = '/([\n]+)/'; + break; + case 1: + // Level 1: Split into sentences. + $expr = '/([\n,!;?\.]+)/'; + break; + case 2: + // Level 2: Split into words. + $expr = '/(\s+)/'; + break; + case 3: + // Level 3: Split into characters. + return phutil_utf8v_combined($corpus); + } + + $pieces = preg_split($expr, $corpus, -1, PREG_SPLIT_DELIM_CAPTURE); + return $this->stitchPieces($pieces, $level); + } + + private function stitchPieces(array $pieces, $level) { + $results = array(); + $count = count($pieces); + for ($ii = 0; $ii < $count; $ii += 2) { + $result = $pieces[$ii]; + if ($ii + 1 < $count) { + $result .= $pieces[$ii + 1]; + } + + if ($level < 2) { + // Split pieces into separate text and whitespace sections: make one + // piece out of all the whitespace at the beginning, one piece out of + // all the actual text in the middle, and one piece out of all the + // whitespace at the end. + + $matches = null; + preg_match('/^(\s*)(.*?)(\s*)\z/', $result, $matches); + + if (strlen($matches[1])) { + $results[] = $matches[1]; + } + if (strlen($matches[2])) { + $results[] = $matches[2]; + } + if (strlen($matches[3])) { + $results[] = $matches[3]; + } + } else { + $results[] = $result; + } + } + + // If the input ended with a delimiter, we can get an empty final piece. + // Just discard it. + if (last($results) == '') { + array_pop($results); + } + + return $results; + } + +} diff --git a/src/infrastructure/diff/prose/__tests__/PhutilProseDiffTestCase.php b/src/infrastructure/diff/prose/__tests__/PhutilProseDiffTestCase.php new file mode 100644 --- /dev/null +++ b/src/infrastructure/diff/prose/__tests__/PhutilProseDiffTestCase.php @@ -0,0 +1,245 @@ +assertProseParts( + '', + '', + array(), + pht('Empty')); + + $this->assertProseParts( + "xxx\nyyy", + "xxx\nzzz\nyyy", + array( + "= xxx\n", + "+ zzz\n", + '= yyy', + ), + pht('Add Paragraph')); + + $this->assertProseParts( + "xxx\nzzz\nyyy", + "xxx\nyyy", + array( + "= xxx\n", + "- zzz\n", + '= yyy', + ), + pht('Remove Paragraph')); + + + // Without smoothing, the alogorithm identifies that "shark" and "cat" + // both contain the letter "a" and tries to express this as a very + // fine-grained edit which replaces "sh" with "c" and then "rk" with "t". + // This is technically correct, but it is much easier for human viewers to + // parse if we smooth this into a single removal and a single addition. + + $this->assertProseParts( + 'They say the shark has nine lives.', + 'They say the cat has nine lives.', + array( + '= They say the ', + '- shark', + '+ cat', + '= has nine lives.', + ), + pht('"Shark/cat" word edit smoothenss.')); + + $this->assertProseParts( + 'Rising quickly, she says', + 'Rising quickly, she remarks:', + array( + '= Rising quickly, she ', + '- says', + '+ remarks:', + ), + pht('"Says/remarks" word edit smoothenss.')); + + $this->assertProseParts( + 'See screenshots', + 'Viewed video files', + array( + '- See screenshots', + '+ Viewed video files', + ), + pht('Complete paragraph rewrite.')); + + $this->assertProseParts( + 'xaaax', + 'xbbbx', + array( + '- xaaax', + '+ xbbbx', + ), + pht('Whole word rewrite with common prefix and suffix.')); + + $this->assertProseParts( + ' aaa ', + ' bbb ', + array( + '= ', + '- aaa', + '+ bbb', + '= ', + ), + pht('Whole word rewrite with whitespace prefix and suffix.')); + + $this->assertSummaryProseParts( + "a\nb\nc\nd\ne\nf\ng\nh\n", + "a\nb\nc\nd\nX\nf\ng\nh\n", + array( + '.', + "= d\n", + '- e', + '+ X', + "= \nf", + '.', + ), + pht('Summary diff with middle change.')); + + $this->assertSummaryProseParts( + "a\nb\nc\nd\ne\nf\ng\nh\n", + "X\nb\nc\nd\ne\nf\ng\nh\n", + array( + '- a', + '+ X', + "= \nb", + '.', + ), + pht('Summary diff with head change.')); + + $this->assertSummaryProseParts( + "a\nb\nc\nd\ne\nf\ng\nh\n", + "a\nb\nc\nd\ne\nf\ng\nX\n", + array( + '.', + "= g\n", + '- h', + '+ X', + "= \n", + ), + pht('Summary diff with last change.')); + + $this->assertProseParts( + 'aaa aaa aaa aaa, bbb bbb bbb bbb.', + "aaa aaa aaa aaa, bbb bbb bbb bbb.\n\n- ccc ccc ccc", + array( + '= aaa aaa aaa aaa, bbb bbb bbb bbb.', + "+ \n\n- ccc ccc ccc", + ), + pht('Diff with new trailing content.')); + + $this->assertProseParts( + 'aaa aaa aaa aaa, bbb bbb bbb bbb.', + 'aaa aaa aaa aaa bbb bbb bbb bbb.', + array( + '= aaa aaa aaa aaa', + '- ,', + '= bbb bbb bbb bbb.', + ), + pht('Diff with a removed comma.')); + + $this->assertProseParts( + 'aaa aaa aaa aaa, bbb bbb bbb bbb.', + "aaa aaa aaa aaa bbb bbb bbb bbb.\n\n- ccc ccc ccc!", + array( + '= aaa aaa aaa aaa', + '- ,', + '= bbb bbb bbb bbb.', + "+ \n\n- ccc ccc ccc!", + ), + pht('Diff with a removed comma and new trailing content.')); + + $this->assertProseParts( + '[ ] Walnuts', + '[X] Walnuts', + array( + '= [', + '- ', + '+ X', + '= ] Walnuts', + ), + pht('Diff adding a tickmark to a checkbox list.')); + + $this->assertProseParts( + '[[ ./week49 ]]', + '[[ ./week50 ]]', + array( + '= [[ ./week', + '- 49', + '+ 50', + '= ]]', + ), + pht('Diff changing a remarkup wiki link target.')); + + // Create a large corpus with many sentences and paragraphs. + $large_paragraph = 'xyz. '; + $large_paragraph = str_repeat($large_paragraph, 50); + $large_paragraph = rtrim($large_paragraph); + + $large_corpus = $large_paragraph."\n\n"; + $large_corpus = str_repeat($large_corpus, 50); + $large_corpus = rtrim($large_corpus); + + $this->assertProseParts( + $large_corpus, + "aaa\n\n".$large_corpus."\n\nzzz", + array( + "+ aaa\n\n", + '= '.$large_corpus, + "+ \n\nzzz", + ), + pht('Adding initial and final lines to a large corpus.')); + + } + + private function assertProseParts($old, $new, array $expect_parts, $label) { + $engine = new PhutilProseDifferenceEngine(); + $diff = $engine->getDiff($old, $new); + + $parts = $diff->getParts(); + + $this->assertParts($expect_parts, $parts, $label); + } + + private function assertSummaryProseParts( + $old, + $new, + array $expect_parts, + $label) { + + $engine = new PhutilProseDifferenceEngine(); + $diff = $engine->getDiff($old, $new); + + $parts = $diff->getSummaryParts(); + + $this->assertParts($expect_parts, $parts, $label); + } + + private function assertParts( + array $expect, + array $actual_parts, + $label) { + + $actual = array(); + foreach ($actual_parts as $actual_part) { + $type = $actual_part['type']; + $text = $actual_part['text']; + + switch ($type) { + case '.': + $actual[] = $type; + break; + default: + $actual[] = "{$type} {$text}"; + break; + } + } + + $this->assertEqual($expect, $actual, $label); + } + + +}