Page MenuHomePhabricator

D20837.id49668.diff
No OneTemporary

D20837.id49668.diff

diff --git a/scripts/utils/prosediff.php b/scripts/utils/prosediff.php
deleted file mode 100755
--- a/scripts/utils/prosediff.php
+++ /dev/null
@@ -1,50 +0,0 @@
-#!/usr/bin/env php
-<?php
-
-require_once dirname(__FILE__).'/../__init_script__.php';
-
-$args = new PhutilArgumentParser($argv);
-$args->setTagline(pht('show prose differences'));
-$args->setSynopsis(<<<EOHELP
-**prosediff.php** __old__ __new__ [__options__]
- Diff two prose files.
-
-EOHELP
-);
-
-$args->parseStandardArguments();
-$args->parse(
- array(
- array(
- 'name' => 'files',
- 'wildcard' => true,
- ),
- ));
-
-$files = $args->getArg('files');
-if (count($files) !== 2) {
- $args->printHelpAndExit();
-}
-$old_file = head($files);
-$new_file = last($files);
-
-$old_data = Filesystem::readFile($old_file);
-$new_data = Filesystem::readFile($new_file);
-
-$engine = new PhutilProseDifferenceEngine();
-
-$prose_diff = $engine->getDiff($old_data, $new_data);
-
-foreach ($prose_diff->getParts() as $part) {
- switch ($part['type']) {
- case '-':
- echo tsprintf('<bg:red>%B</bg>', $part['text']);
- break;
- case '+':
- echo tsprintf('<bg:green>%B</bg>', $part['text']);
- break;
- case '=':
- echo tsprintf('%B', $part['text']);
- break;
- }
-}
diff --git a/src/__phutil_library_map__.php b/src/__phutil_library_map__.php
--- a/src/__phutil_library_map__.php
+++ b/src/__phutil_library_map__.php
@@ -297,9 +297,6 @@
'PhutilProcessRef' => 'filesystem/PhutilProcessRef.php',
'PhutilProcessRefTestCase' => 'filesystem/__tests__/PhutilProcessRefTestCase.php',
'PhutilProgressSink' => 'progress/PhutilProgressSink.php',
- 'PhutilProseDiff' => 'utils/PhutilProseDiff.php',
- 'PhutilProseDiffTestCase' => 'utils/__tests__/PhutilProseDiffTestCase.php',
- 'PhutilProseDifferenceEngine' => 'utils/PhutilProseDifferenceEngine.php',
'PhutilProtocolChannel' => 'channel/PhutilProtocolChannel.php',
'PhutilProxyException' => 'error/PhutilProxyException.php',
'PhutilProxyIterator' => 'utils/PhutilProxyIterator.php',
@@ -841,9 +838,6 @@
'PhutilProcessRef' => 'Phobject',
'PhutilProcessRefTestCase' => 'PhutilTestCase',
'PhutilProgressSink' => 'Phobject',
- 'PhutilProseDiff' => 'Phobject',
- 'PhutilProseDiffTestCase' => 'PhutilTestCase',
- 'PhutilProseDifferenceEngine' => 'Phobject',
'PhutilProtocolChannel' => 'PhutilChannelChannel',
'PhutilProxyException' => 'Exception',
'PhutilProxyIterator' => array(
diff --git a/src/utils/PhutilProseDiff.php b/src/utils/PhutilProseDiff.php
deleted file mode 100644
--- a/src/utils/PhutilProseDiff.php
+++ /dev/null
@@ -1,292 +0,0 @@
-<?php
-
-final class PhutilProseDiff extends Phobject {
-
- private $parts = array();
-
- public function addPart($type, $text) {
- $this->parts[] = array(
- 'type' => $type,
- 'text' => $text,
- );
- return $this;
- }
-
- public function getParts() {
- return $this->parts;
- }
-
- /**
- * Get diff parts, but replace large blocks of unchanged text with "."
- * parts representing missing context.
- */
- public function getSummaryParts() {
- $parts = $this->getParts();
-
- $head_key = head_key($parts);
- $last_key = last_key($parts);
-
- $results = array();
- foreach ($parts as $key => $part) {
- $is_head = ($key == $head_key);
- $is_last = ($key == $last_key);
-
- switch ($part['type']) {
- case '=':
- $pieces = $this->splitTextForSummary($part['text']);
-
- if ($is_head || $is_last) {
- $need = 2;
- } else {
- $need = 3;
- }
-
- // We don't have enough pieces to omit anything, so just continue.
- if (count($pieces) < $need) {
- $results[] = $part;
- break;
- }
-
- if (!$is_head) {
- $results[] = array(
- 'type' => '=',
- 'text' => head($pieces),
- );
- }
-
- $results[] = array(
- 'type' => '.',
- 'text' => null,
- );
-
- if (!$is_last) {
- $results[] = array(
- 'type' => '=',
- 'text' => last($pieces),
- );
- }
- break;
- default:
- $results[] = $part;
- break;
- }
- }
-
- return $results;
- }
-
-
- public function reorderParts() {
- // Reorder sequences of removed and added sections to put all the "-"
- // parts together first, then all the "+" parts together. This produces
- // a more human-readable result than intermingling them.
-
- $o_run = array();
- $n_run = array();
- $result = array();
- foreach ($this->parts as $part) {
- $type = $part['type'];
- switch ($type) {
- case '-':
- $o_run[] = $part;
- break;
- case '+':
- $n_run[] = $part;
- break;
- default:
- if ($o_run || $n_run) {
- foreach ($this->combineRuns($o_run, $n_run) as $merged_part) {
- $result[] = $merged_part;
- }
- $o_run = array();
- $n_run = array();
- }
- $result[] = $part;
- break;
- }
- }
-
- if ($o_run || $n_run) {
- foreach ($this->combineRuns($o_run, $n_run) as $part) {
- $result[] = $part;
- }
- }
-
- // Now, combine consecuitive runs of the same type of change (like a
- // series of "-" parts) into a single run.
- $combined = array();
-
- $last = null;
- $last_text = null;
- foreach ($result as $part) {
- $type = $part['type'];
-
- if ($last !== $type) {
- if ($last !== null) {
- $combined[] = array(
- 'type' => $last,
- 'text' => $last_text,
- );
- }
- $last_text = null;
- $last = $type;
- }
-
- $last_text .= $part['text'];
- }
-
- if ($last_text !== null) {
- $combined[] = array(
- 'type' => $last,
- 'text' => $last_text,
- );
- }
-
- $this->parts = $combined;
-
- return $this;
- }
-
- private function combineRuns($o_run, $n_run) {
- $o_merge = $this->mergeParts($o_run);
- $n_merge = $this->mergeParts($n_run);
-
- // When removed and added blocks share a prefix or suffix, we sometimes
- // want to count it as unchanged (for example, if it is whitespace) but
- // sometimes want to count it as changed (for example, if it is a word
- // suffix like "ing"). Find common prefixes and suffixes of these layout
- // characters and emit them as "=" (unchanged) blocks.
-
- $layout_characters = array(
- ' ' => true,
- "\n" => true,
- '.' => true,
- '!' => true,
- ',' => true,
- '?' => true,
- ']' => true,
- '[' => true,
- '(' => true,
- ')' => true,
- '<' => true,
- '>' => true,
- );
-
- $o_text = $o_merge['text'];
- $n_text = $n_merge['text'];
- $o_len = strlen($o_text);
- $n_len = strlen($n_text);
- $min_len = min($o_len, $n_len);
-
- $prefix_len = 0;
- for ($pos = 0; $pos < $min_len; $pos++) {
- $o = $o_text[$pos];
- $n = $n_text[$pos];
- if ($o !== $n) {
- break;
- }
- if (empty($layout_characters[$o])) {
- break;
- }
- $prefix_len++;
- }
-
- $suffix_len = 0;
- for ($pos = 0; $pos < ($min_len - $prefix_len); $pos++) {
- $o = $o_text[$o_len - ($pos + 1)];
- $n = $n_text[$n_len - ($pos + 1)];
- if ($o !== $n) {
- break;
- }
- if (empty($layout_characters[$o])) {
- break;
- }
- $suffix_len++;
- }
-
- $results = array();
-
- if ($prefix_len) {
- $results[] = array(
- 'type' => '=',
- 'text' => substr($o_text, 0, $prefix_len),
- );
- }
-
- if ($prefix_len < $o_len) {
- $results[] = array(
- 'type' => '-',
- 'text' => substr(
- $o_text,
- $prefix_len,
- $o_len - $prefix_len - $suffix_len),
- );
- }
-
- if ($prefix_len < $n_len) {
- $results[] = array(
- 'type' => '+',
- 'text' => substr(
- $n_text,
- $prefix_len,
- $n_len - $prefix_len - $suffix_len),
- );
- }
-
- if ($suffix_len) {
- $results[] = array(
- 'type' => '=',
- 'text' => substr($o_text, -$suffix_len),
- );
- }
-
- return $results;
- }
-
- private function mergeParts(array $parts) {
- $text = '';
- $type = null;
- foreach ($parts as $part) {
- $part_type = $part['type'];
- if ($type === null) {
- $type = $part_type;
- }
- if ($type !== $part_type) {
- throw new Exception(pht('Can not merge parts of dissimilar types!'));
- }
- $text .= $part['text'];
- }
-
- return array(
- 'type' => $type,
- 'text' => $text,
- );
- }
-
- private function splitTextForSummary($text) {
- $matches = null;
-
- $ok = preg_match('/^(\n*[^\n]+)\n/', $text, $matches);
- if (!$ok) {
- return array($text);
- }
-
- $head = $matches[1];
- $text = substr($text, strlen($head));
-
- $ok = preg_match('/\n([^\n]+\n*)\z/', $text, $matches);
- if (!$ok) {
- return array($text);
- }
-
- $last = $matches[1];
- $text = substr($text, 0, -strlen($last));
-
- if (!strlen(trim($text))) {
- return array($head, $last);
- } else {
- return array($head, $text, $last);
- }
- }
-
-}
diff --git a/src/utils/PhutilProseDifferenceEngine.php b/src/utils/PhutilProseDifferenceEngine.php
deleted file mode 100644
--- a/src/utils/PhutilProseDifferenceEngine.php
+++ /dev/null
@@ -1,209 +0,0 @@
-<?php
-
-final class PhutilProseDifferenceEngine extends Phobject {
-
- public function getDiff($u, $v) {
- return $this->buildDiff($u, $v, 0);
- }
-
- private function buildDiff($u, $v, $level) {
- $u_parts = $this->splitCorpus($u, $level);
- $v_parts = $this->splitCorpus($v, $level);
-
- $matrix = id(new PhutilEditDistanceMatrix())
- ->setMaximumLength(128)
- ->setSequences($u_parts, $v_parts)
- ->setComputeString(true);
-
- // For word-level and character-level changes, smooth the output string
- // to reduce the choppiness of the diff.
- if ($level > 1) {
- $matrix->setApplySmoothing(PhutilEditDistanceMatrix::SMOOTHING_FULL);
- }
-
- $u_pos = 0;
- $v_pos = 0;
-
- $edits = $matrix->getEditString();
- $edits_length = strlen($edits);
-
- $diff = new PhutilProseDiff();
- for ($ii = 0; $ii < $edits_length; $ii++) {
- $c = $edits[$ii];
- if ($c == 's') {
- $diff->addPart('=', $u_parts[$u_pos]);
- $u_pos++;
- $v_pos++;
- } else if ($c == 'd') {
- $diff->addPart('-', $u_parts[$u_pos]);
- $u_pos++;
- } else if ($c == 'i') {
- $diff->addPart('+', $v_parts[$v_pos]);
- $v_pos++;
- } else if ($c == 'x') {
- $diff->addPart('-', $u_parts[$u_pos]);
- $diff->addPart('+', $v_parts[$v_pos]);
- $u_pos++;
- $v_pos++;
- } else {
- throw new Exception(
- pht(
- 'Unexpected character ("%s") in edit string.',
- $c));
- }
- }
-
- $diff->reorderParts();
-
- // If we just built a character-level diff, we're all done and do not
- // need to go any deeper.
- if ($level == 3) {
- return $diff;
- }
-
- $blocks = array();
- $block = null;
- foreach ($diff->getParts() as $part) {
- $type = $part['type'];
- $text = $part['text'];
- switch ($type) {
- case '=':
- if ($block) {
- $blocks[] = $block;
- $block = null;
- }
- $blocks[] = array(
- 'type' => $type,
- 'text' => $text,
- );
- break;
- case '-':
- if (!$block) {
- $block = array(
- 'type' => '!',
- 'old' => '',
- 'new' => '',
- );
- }
- $block['old'] .= $text;
- break;
- case '+':
- if (!$block) {
- $block = array(
- 'type' => '!',
- 'old' => '',
- 'new' => '',
- );
- }
- $block['new'] .= $text;
- break;
- }
- }
-
- if ($block) {
- $blocks[] = $block;
- }
-
- $result = new PhutilProseDiff();
- foreach ($blocks as $block) {
- $type = $block['type'];
- if ($type == '=') {
- $result->addPart('=', $block['text']);
- } else {
- $old = $block['old'];
- $new = $block['new'];
- if (!strlen($old) && !strlen($new)) {
- // Nothing to do.
- } else if (!strlen($old)) {
- $result->addPart('+', $new);
- } else if (!strlen($new)) {
- $result->addPart('-', $old);
- } else {
- if ($matrix->didReachMaximumLength()) {
- // If this text was too big to diff, don't try to subdivide it.
- $result->addPart('-', $old);
- $result->addPart('+', $new);
- } else {
- $subdiff = $this->buildDiff(
- $old,
- $new,
- $level + 1);
-
- foreach ($subdiff->getParts() as $part) {
- $result->addPart($part['type'], $part['text']);
- }
- }
- }
- }
- }
-
- $result->reorderParts();
-
- return $result;
- }
-
- private function splitCorpus($corpus, $level) {
- switch ($level) {
- case 0:
- // Level 0: Split into paragraphs.
- $expr = '/([\n]+)/';
- break;
- case 1:
- // Level 1: Split into sentences.
- $expr = '/([\n,!;?\.]+)/';
- break;
- case 2:
- // Level 2: Split into words.
- $expr = '/(\s+)/';
- break;
- case 3:
- // Level 3: Split into characters.
- return phutil_utf8v_combined($corpus);
- }
-
- $pieces = preg_split($expr, $corpus, -1, PREG_SPLIT_DELIM_CAPTURE);
- return $this->stitchPieces($pieces, $level);
- }
-
- private function stitchPieces(array $pieces, $level) {
- $results = array();
- $count = count($pieces);
- for ($ii = 0; $ii < $count; $ii += 2) {
- $result = $pieces[$ii];
- if ($ii + 1 < $count) {
- $result .= $pieces[$ii + 1];
- }
-
- if ($level < 2) {
- // Split pieces into separate text and whitespace sections: make one
- // piece out of all the whitespace at the beginning, one piece out of
- // all the actual text in the middle, and one piece out of all the
- // whitespace at the end.
-
- $matches = null;
- preg_match('/^(\s*)(.*?)(\s*)\z/', $result, $matches);
-
- if (strlen($matches[1])) {
- $results[] = $matches[1];
- }
- if (strlen($matches[2])) {
- $results[] = $matches[2];
- }
- if (strlen($matches[3])) {
- $results[] = $matches[3];
- }
- } else {
- $results[] = $result;
- }
- }
-
- // If the input ended with a delimiter, we can get an empty final piece.
- // Just discard it.
- if (last($results) == '') {
- array_pop($results);
- }
-
- return $results;
- }
-
-}
diff --git a/src/utils/__tests__/PhutilProseDiffTestCase.php b/src/utils/__tests__/PhutilProseDiffTestCase.php
deleted file mode 100644
--- a/src/utils/__tests__/PhutilProseDiffTestCase.php
+++ /dev/null
@@ -1,245 +0,0 @@
-<?php
-
-final class PhutilProseDiffTestCase extends PhutilTestCase {
-
- public function testProseDiffsDistance() {
- $this->assertProseParts(
- '',
- '',
- array(),
- pht('Empty'));
-
- $this->assertProseParts(
- "xxx\nyyy",
- "xxx\nzzz\nyyy",
- array(
- "= xxx\n",
- "+ zzz\n",
- '= yyy',
- ),
- pht('Add Paragraph'));
-
- $this->assertProseParts(
- "xxx\nzzz\nyyy",
- "xxx\nyyy",
- array(
- "= xxx\n",
- "- zzz\n",
- '= yyy',
- ),
- pht('Remove Paragraph'));
-
-
- // Without smoothing, the alogorithm identifies that "shark" and "cat"
- // both contain the letter "a" and tries to express this as a very
- // fine-grained edit which replaces "sh" with "c" and then "rk" with "t".
- // This is technically correct, but it is much easier for human viewers to
- // parse if we smooth this into a single removal and a single addition.
-
- $this->assertProseParts(
- 'They say the shark has nine lives.',
- 'They say the cat has nine lives.',
- array(
- '= They say the ',
- '- shark',
- '+ cat',
- '= has nine lives.',
- ),
- pht('"Shark/cat" word edit smoothenss.'));
-
- $this->assertProseParts(
- 'Rising quickly, she says',
- 'Rising quickly, she remarks:',
- array(
- '= Rising quickly, she ',
- '- says',
- '+ remarks:',
- ),
- pht('"Says/remarks" word edit smoothenss.'));
-
- $this->assertProseParts(
- 'See screenshots',
- 'Viewed video files',
- array(
- '- See screenshots',
- '+ Viewed video files',
- ),
- pht('Complete paragraph rewrite.'));
-
- $this->assertProseParts(
- 'xaaax',
- 'xbbbx',
- array(
- '- xaaax',
- '+ xbbbx',
- ),
- pht('Whole word rewrite with common prefix and suffix.'));
-
- $this->assertProseParts(
- ' aaa ',
- ' bbb ',
- array(
- '= ',
- '- aaa',
- '+ bbb',
- '= ',
- ),
- pht('Whole word rewrite with whitespace prefix and suffix.'));
-
- $this->assertSummaryProseParts(
- "a\nb\nc\nd\ne\nf\ng\nh\n",
- "a\nb\nc\nd\nX\nf\ng\nh\n",
- array(
- '.',
- "= d\n",
- '- e',
- '+ X',
- "= \nf",
- '.',
- ),
- pht('Summary diff with middle change.'));
-
- $this->assertSummaryProseParts(
- "a\nb\nc\nd\ne\nf\ng\nh\n",
- "X\nb\nc\nd\ne\nf\ng\nh\n",
- array(
- '- a',
- '+ X',
- "= \nb",
- '.',
- ),
- pht('Summary diff with head change.'));
-
- $this->assertSummaryProseParts(
- "a\nb\nc\nd\ne\nf\ng\nh\n",
- "a\nb\nc\nd\ne\nf\ng\nX\n",
- array(
- '.',
- "= g\n",
- '- h',
- '+ X',
- "= \n",
- ),
- pht('Summary diff with last change.'));
-
- $this->assertProseParts(
- 'aaa aaa aaa aaa, bbb bbb bbb bbb.',
- "aaa aaa aaa aaa, bbb bbb bbb bbb.\n\n- ccc ccc ccc",
- array(
- '= aaa aaa aaa aaa, bbb bbb bbb bbb.',
- "+ \n\n- ccc ccc ccc",
- ),
- pht('Diff with new trailing content.'));
-
- $this->assertProseParts(
- 'aaa aaa aaa aaa, bbb bbb bbb bbb.',
- 'aaa aaa aaa aaa bbb bbb bbb bbb.',
- array(
- '= aaa aaa aaa aaa',
- '- ,',
- '= bbb bbb bbb bbb.',
- ),
- pht('Diff with a removed comma.'));
-
- $this->assertProseParts(
- 'aaa aaa aaa aaa, bbb bbb bbb bbb.',
- "aaa aaa aaa aaa bbb bbb bbb bbb.\n\n- ccc ccc ccc!",
- array(
- '= aaa aaa aaa aaa',
- '- ,',
- '= bbb bbb bbb bbb.',
- "+ \n\n- ccc ccc ccc!",
- ),
- pht('Diff with a removed comma and new trailing content.'));
-
- $this->assertProseParts(
- '[ ] Walnuts',
- '[X] Walnuts',
- array(
- '= [',
- '- ',
- '+ X',
- '= ] Walnuts',
- ),
- pht('Diff adding a tickmark to a checkbox list.'));
-
- $this->assertProseParts(
- '[[ ./week49 ]]',
- '[[ ./week50 ]]',
- array(
- '= [[ ./week',
- '- 49',
- '+ 50',
- '= ]]',
- ),
- pht('Diff changing a remarkup wiki link target.'));
-
- // Create a large corpus with many sentences and paragraphs.
- $large_paragraph = 'xyz. ';
- $large_paragraph = str_repeat($large_paragraph, 50);
- $large_paragraph = rtrim($large_paragraph);
-
- $large_corpus = $large_paragraph."\n\n";
- $large_corpus = str_repeat($large_corpus, 50);
- $large_corpus = rtrim($large_corpus);
-
- $this->assertProseParts(
- $large_corpus,
- "aaa\n\n".$large_corpus."\n\nzzz",
- array(
- "+ aaa\n\n",
- '= '.$large_corpus,
- "+ \n\nzzz",
- ),
- pht('Adding initial and final lines to a large corpus.'));
-
- }
-
- private function assertProseParts($old, $new, array $expect_parts, $label) {
- $engine = new PhutilProseDifferenceEngine();
- $diff = $engine->getDiff($old, $new);
-
- $parts = $diff->getParts();
-
- $this->assertParts($expect_parts, $parts, $label);
- }
-
- private function assertSummaryProseParts(
- $old,
- $new,
- array $expect_parts,
- $label) {
-
- $engine = new PhutilProseDifferenceEngine();
- $diff = $engine->getDiff($old, $new);
-
- $parts = $diff->getSummaryParts();
-
- $this->assertParts($expect_parts, $parts, $label);
- }
-
- private function assertParts(
- array $expect,
- array $actual_parts,
- $label) {
-
- $actual = array();
- foreach ($actual_parts as $actual_part) {
- $type = $actual_part['type'];
- $text = $actual_part['text'];
-
- switch ($type) {
- case '.':
- $actual[] = $type;
- break;
- default:
- $actual[] = "{$type} {$text}";
- break;
- }
- }
-
- $this->assertEqual($expect, $actual, $label);
- }
-
-
-}

File Metadata

Mime Type
text/plain
Expires
Fri, Mar 21, 10:40 AM (3 d, 23 h ago)
Storage Engine
blob
Storage Format
Encrypted (AES-256-CBC)
Storage Handle
7715373
Default Alt Text
D20837.id49668.diff (21 KB)

Event Timeline