diff --git a/externals/cldr/cldr_windows_timezones.xml b/externals/cldr/cldr_windows_timezones.xml new file mode 100644 index 0000000..47b689d --- /dev/null +++ b/externals/cldr/cldr_windows_timezones.xml @@ -0,0 +1,769 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/resources/timezones/windows_timezones.json b/resources/timezones/windows_timezones.json new file mode 100644 index 0000000..7a287b6 --- /dev/null +++ b/resources/timezones/windows_timezones.json @@ -0,0 +1,126 @@ +{ + "Egypt Standard Time": "Africa/Cairo", + "Morocco Standard Time": "Africa/Casablanca", + "South Africa Standard Time": "Africa/Johannesburg", + "W. Central Africa Standard Time": "Africa/Lagos", + "E. Africa Standard Time": "Africa/Nairobi", + "Libya Standard Time": "Africa/Tripoli", + "Namibia Standard Time": "Africa/Windhoek", + "Aleutian Standard Time": "America/Adak", + "Alaskan Standard Time": "America/Anchorage", + "Tocantins Standard Time": "America/Araguaina", + "Paraguay Standard Time": "America/Asuncion", + "Bahia Standard Time": "America/Bahia", + "SA Pacific Standard Time": "America/Bogota", + "Argentina Standard Time": "America/Buenos_Aires", + "Eastern Standard Time (Mexico)": "America/Cancun", + "Venezuela Standard Time": "America/Caracas", + "SA Eastern Standard Time": "America/Cayenne", + "Central Standard Time": "America/Chicago", + "Mountain Standard Time (Mexico)": "America/Chihuahua", + "Central Brazilian Standard Time": "America/Cuiaba", + "Mountain Standard Time": "America/Denver", + "Greenland Standard Time": "America/Godthab", + "Turks And Caicos Standard Time": "America/Grand_Turk", + "Central America Standard Time": "America/Guatemala", + "Atlantic Standard Time": "America/Halifax", + "Cuba Standard Time": "America/Havana", + "US Eastern Standard Time": "America/Indianapolis", + "SA Western Standard Time": "America/La_Paz", + "Pacific Standard Time": "America/Los_Angeles", + "Central Standard Time (Mexico)": "America/Mexico_City", + "Saint Pierre Standard Time": "America/Miquelon", + "Montevideo Standard Time": "America/Montevideo", + "Eastern Standard Time": "America/New_York", + "US Mountain Standard Time": "America/Phoenix", + "Haiti Standard Time": "America/Port-au-Prince", + "Canada Central Standard Time": "America/Regina", + "Pacific SA Standard Time": "America/Santiago", + "E. South America Standard Time": "America/Sao_Paulo", + "Newfoundland Standard Time": "America/St_Johns", + "Pacific Standard Time (Mexico)": "America/Tijuana", + "Central Asia Standard Time": "Asia/Almaty", + "Jordan Standard Time": "Asia/Amman", + "Arabic Standard Time": "Asia/Baghdad", + "Azerbaijan Standard Time": "Asia/Baku", + "SE Asia Standard Time": "Asia/Bangkok", + "Altai Standard Time": "Asia/Barnaul", + "Middle East Standard Time": "Asia/Beirut", + "India Standard Time": "Asia/Calcutta", + "Transbaikal Standard Time": "Asia/Chita", + "Sri Lanka Standard Time": "Asia/Colombo", + "Syria Standard Time": "Asia/Damascus", + "Bangladesh Standard Time": "Asia/Dhaka", + "Arabian Standard Time": "Asia/Dubai", + "West Bank Standard Time": "Asia/Hebron", + "W. Mongolia Standard Time": "Asia/Hovd", + "North Asia East Standard Time": "Asia/Irkutsk", + "Israel Standard Time": "Asia/Jerusalem", + "Afghanistan Standard Time": "Asia/Kabul", + "Russia Time Zone 11": "Asia/Kamchatka", + "Pakistan Standard Time": "Asia/Karachi", + "Nepal Standard Time": "Asia/Katmandu", + "North Asia Standard Time": "Asia/Krasnoyarsk", + "Magadan Standard Time": "Asia/Magadan", + "N. Central Asia Standard Time": "Asia/Novosibirsk", + "Omsk Standard Time": "Asia/Omsk", + "North Korea Standard Time": "Asia/Pyongyang", + "Myanmar Standard Time": "Asia/Rangoon", + "Arab Standard Time": "Asia/Riyadh", + "Sakhalin Standard Time": "Asia/Sakhalin", + "Korea Standard Time": "Asia/Seoul", + "China Standard Time": "Asia/Shanghai", + "Singapore Standard Time": "Asia/Singapore", + "Russia Time Zone 10": "Asia/Srednekolymsk", + "Taipei Standard Time": "Asia/Taipei", + "West Asia Standard Time": "Asia/Tashkent", + "Georgian Standard Time": "Asia/Tbilisi", + "Iran Standard Time": "Asia/Tehran", + "Tokyo Standard Time": "Asia/Tokyo", + "Tomsk Standard Time": "Asia/Tomsk", + "Ulaanbaatar Standard Time": "Asia/Ulaanbaatar", + "Vladivostok Standard Time": "Asia/Vladivostok", + "Yakutsk Standard Time": "Asia/Yakutsk", + "Ekaterinburg Standard Time": "Asia/Yekaterinburg", + "Caucasus Standard Time": "Asia/Yerevan", + "Azores Standard Time": "Atlantic/Azores", + "Cape Verde Standard Time": "Atlantic/Cape_Verde", + "Greenwich Standard Time": "Atlantic/Reykjavik", + "Cen. Australia Standard Time": "Australia/Adelaide", + "E. Australia Standard Time": "Australia/Brisbane", + "AUS Central Standard Time": "Australia/Darwin", + "Aus Central W. Standard Time": "Australia/Eucla", + "Tasmania Standard Time": "Australia/Hobart", + "Lord Howe Standard Time": "Australia/Lord_Howe", + "W. Australia Standard Time": "Australia/Perth", + "AUS Eastern Standard Time": "Australia/Sydney", + "Dateline Standard Time": "Etc/GMT+12", + "Astrakhan Standard Time": "Europe/Astrakhan", + "W. Europe Standard Time": "Europe/Berlin", + "GTB Standard Time": "Europe/Bucharest", + "Central Europe Standard Time": "Europe/Budapest", + "E. Europe Standard Time": "Europe/Chisinau", + "Turkey Standard Time": "Europe/Istanbul", + "Kaliningrad Standard Time": "Europe/Kaliningrad", + "FLE Standard Time": "Europe/Kiev", + "GMT Standard Time": "Europe/London", + "Belarus Standard Time": "Europe/Minsk", + "Russian Standard Time": "Europe/Moscow", + "Romance Standard Time": "Europe/Paris", + "Russia Time Zone 3": "Europe/Samara", + "Central European Standard Time": "Europe/Warsaw", + "Mauritius Standard Time": "Indian/Mauritius", + "Samoa Standard Time": "Pacific/Apia", + "New Zealand Standard Time": "Pacific/Auckland", + "Bougainville Standard Time": "Pacific/Bougainville", + "Chatham Islands Standard Time": "Pacific/Chatham", + "Easter Island Standard Time": "Pacific/Easter", + "Fiji Standard Time": "Pacific/Fiji", + "Central Pacific Standard Time": "Pacific/Guadalcanal", + "Hawaiian Standard Time": "Pacific/Honolulu", + "Line Islands Standard Time": "Pacific/Kiritimati", + "Marquesas Standard Time": "Pacific/Marquesas", + "Norfolk Standard Time": "Pacific/Norfolk", + "West Pacific Standard Time": "Pacific/Port_Moresby", + "Tonga Standard Time": "Pacific/Tongatapu" +} diff --git a/scripts/timezones/generate_windows_timezone_map.php b/scripts/timezones/generate_windows_timezone_map.php new file mode 100755 index 0000000..41b06be --- /dev/null +++ b/scripts/timezones/generate_windows_timezone_map.php @@ -0,0 +1,46 @@ +#!/usr/bin/env php +windowsZones->mapTimezones->mapZone; +foreach ($zones as $zone) { + $windows_name = (string)$zone['other']; + $target_name = (string)$zone['type']; + + // Ignore the offset-based timezones from the CLDR map, since we handle + // these later. + if (isset($ignore[$windows_name])) { + continue; + } + + // We've already seen this timezone so we don't need to add it to the map + // again. + if (isset($result_map[$windows_name])) { + continue; + } + + $result_map[$windows_name] = $target_name; +} + +asort($result_map); + +echo id(new PhutilJSON()) + ->encodeFormatted($result_map); diff --git a/src/parser/calendar/data/PhutilCalendarAbsoluteDateTime.php b/src/parser/calendar/data/PhutilCalendarAbsoluteDateTime.php index 4fbb2f6..0129e6c 100644 --- a/src/parser/calendar/data/PhutilCalendarAbsoluteDateTime.php +++ b/src/parser/calendar/data/PhutilCalendarAbsoluteDateTime.php @@ -1,275 +1,287 @@ \d{4})(?P\d{2})(?P\d{2})'. '(?:'. 'T(?P\d{2})(?P\d{2})(?P\d{2})(?Z)?'. ')?'. '\z/'; $matches = null; $ok = preg_match($pattern, $value, $matches); if (!$ok) { throw new Exception( pht( 'Expected ISO8601 datetime in the format "19990105T112233Z", '. 'found "%s".', $value)); } if (isset($matches['z'])) { if ($timezone != 'UTC') { throw new Exception( pht( 'ISO8601 date ends in "Z" indicating UTC, but a timezone other '. 'than UTC ("%s") was specified.', $timezone)); } } $datetime = id(new self()) ->setYear((int)$matches['y']) ->setMonth((int)$matches['m']) ->setDay((int)$matches['d']) ->setTimezone($timezone); if (isset($matches['h'])) { $datetime ->setHour((int)$matches['h']) ->setMinute((int)$matches['i']) ->setSecond((int)$matches['s']); } else { $datetime ->setIsAllDay(true); } return $datetime; } public static function newFromEpoch($epoch, $timezone = 'UTC') { $date = new DateTime('@'.$epoch); $zone = new DateTimeZone($timezone); $date->setTimezone($zone); return id(new self()) ->setYear((int)$date->format('Y')) ->setMonth((int)$date->format('m')) ->setDay((int)$date->format('d')) ->setHour((int)$date->format('H')) ->setMinute((int)$date->format('i')) ->setSecond((int)$date->format('s')) ->setTimezone($timezone); } public static function newFromDictionary(array $dict) { static $keys; if ($keys === null) { $keys = array_fuse( array( 'kind', 'year', 'month', 'day', 'hour', 'minute', 'second', 'timezone', 'isAllDay', )); } foreach ($dict as $key => $value) { if (!isset($keys[$key])) { throw new Exception( pht( 'Unexpected key "%s" in datetime dictionary, expected keys: %s.', $key, implode(', ', array_keys($keys)))); } } if (idx($dict, 'kind') !== 'absolute') { throw new Exception( pht( 'Expected key "%s" with value "%s" in datetime dictionary.', 'kind', 'absolute')); } if (!isset($dict['year'])) { throw new Exception( pht( 'Expected key "%s" in datetime dictionary.', 'year')); } $datetime = id(new self()) ->setYear(idx($dict, 'year')) ->setMonth(idx($dict, 'month', 1)) ->setDay(idx($dict, 'day', 1)) ->setHour(idx($dict, 'hour', 0)) ->setMinute(idx($dict, 'minute', 0)) ->setSecond(idx($dict, 'second', 0)) ->setTimezone(idx($dict, 'timezone')) ->setIsAllDay((bool)idx($dict, 'isAllDay', false)); return $datetime; } public function newRelativeDateTime($duration) { if (is_string($duration)) { $duration = PhutilCalendarDuration::newFromISO8601($duration); } if (!($duration instanceof PhutilCalendarDuration)) { throw new Exception( pht( 'Expected "PhutilCalendarDuration" object or ISO8601 duration '. 'string.')); } return id(new PhutilCalendarRelativeDateTime()) ->setOrigin($this) ->setDuration($duration); } public function toDictionary() { return array( 'kind' => 'absolute', 'year' => (int)$this->getYear(), 'month' => (int)$this->getMonth(), 'day' => (int)$this->getDay(), 'hour' => (int)$this->getHour(), 'minute' => (int)$this->getMinute(), 'second' => (int)$this->getSecond(), 'timezone' => $this->getTimezone(), 'isAllDay' => (bool)$this->getIsAllDay(), ); } public function setYear($year) { $this->year = $year; return $this; } public function getYear() { return $this->year; } public function setMonth($month) { $this->month = $month; return $this; } public function getMonth() { return $this->month; } public function setDay($day) { $this->day = $day; return $this; } public function getDay() { return $this->day; } public function setHour($hour) { $this->hour = $hour; return $this; } public function getHour() { return $this->hour; } public function setMinute($minute) { $this->minute = $minute; return $this; } public function getMinute() { return $this->minute; } public function setSecond($second) { $this->second = $second; return $this; } public function getSecond() { return $this->second; } public function setTimezone($timezone) { $this->timezone = $timezone; return $this; } public function getTimezone() { return $this->timezone; } private function getEffectiveTimezone() { - $zone = $this->getTimezone(); - if ($zone !== null) { - return $zone; + $date_timezone = $this->getTimezone(); + $viewer_timezone = $this->getViewerTimezone(); + + // Because all-day events are always "floating", the effective timezone + // is the viewer timezone if it is available. Otherwise, we'll return a + // DateTime object with the correct values, but it will be incorrectly + // adjusted forward or backward to the viewer's zone later. + + $zones = array(); + if ($this->getIsAllDay()) { + $zones[] = $viewer_timezone; + $zones[] = $date_timezone; + } else { + $zones[] = $date_timezone; + $zones[] = $viewer_timezone; } + $zones = array_filter($zones); - $zone = $this->getViewerTimezone(); - if ($zone !== null) { - return $zone; + if (!$zones) { + throw new Exception( + pht( + 'Datetime has no timezone or viewer timezone.')); } - throw new Exception( - pht( - 'Datetime has no timezone or viewer timezone.')); + return head($zones); } public function newPHPDateTimeZone() { $zone = $this->getEffectiveTimezone(); return new DateTimeZone($zone); } public function newPHPDateTime() { $zone = $this->newPHPDateTimeZone(); $y = $this->getYear(); $m = $this->getMonth(); $d = $this->getDay(); if ($this->getIsAllDay()) { $h = 0; $i = 0; $s = 0; } else { $h = $this->getHour(); $i = $this->getMinute(); $s = $this->getSecond(); } $format = sprintf('%04d-%02d-%02d %02d:%02d:%02d', $y, $m, $d, $h, $i, $s); return new DateTime($format, $zone); } public function newAbsoluteDateTime() { return clone $this; } } diff --git a/src/parser/calendar/ics/PhutilICSParser.php b/src/parser/calendar/ics/PhutilICSParser.php index e473cad..015bea6 100644 --- a/src/parser/calendar/ics/PhutilICSParser.php +++ b/src/parser/calendar/ics/PhutilICSParser.php @@ -1,907 +1,919 @@ stack = array(); $this->node = null; $this->cursor = null; $this->warnings = array(); $lines = $this->unfoldICSLines($data); $this->lines = $lines; $root = $this->newICSNode(''); $this->stack[] = $root; $this->node = $root; foreach ($lines as $key => $line) { $this->cursor = $key; $matches = null; if (preg_match('(^BEGIN:(.*)\z)', $line, $matches)) { $this->beginParsingNode($matches[1]); } else if (preg_match('(^END:(.*)\z)', $line, $matches)) { $this->endParsingNode($matches[1]); } else { if (count($this->stack) < 2) { $this->raiseParseFailure( self::PARSE_ROOT_PROPERTY, pht( 'Found unexpected property at ICS document root.')); } $this->parseICSProperty($line); } } if (count($this->stack) > 1) { $this->raiseParseFailure( self::PARSE_MISSING_END, pht( 'Expected all "BEGIN:" sections in ICS document to have '. 'corresponding "END:" sections.')); } $this->node = null; $this->lines = null; $this->cursor = null; return $root; } private function getNode() { return $this->node; } private function unfoldICSLines($data) { $lines = phutil_split_lines($data, $retain_endings = false); $this->lines = $lines; // ICS files are wrapped at 75 characters, with overlong lines continued // on the following line with an initial space or tab. Unwrap all of the // lines in the file. // This unwrapping is specifically byte-oriented, not character oriented, // and RFC5545 anticipates that simple implementations may even split UTF8 // characters in the middle. $last = null; foreach ($lines as $idx => $line) { $this->cursor = $idx; if (!preg_match('/^[ \t]/', $line)) { $last = $idx; continue; } if ($last === null) { $this->raiseParseFailure( self::PARSE_INITIAL_UNFOLD, pht( 'First line of ICS file begins with a space or tab, but this '. 'marks a line which should be unfolded.')); } $lines[$last] = $lines[$last].substr($line, 1); unset($lines[$idx]); } return $lines; } private function beginParsingNode($type) { $node = $this->getNode(); $new_node = $this->newICSNode($type); if ($node instanceof PhutilCalendarContainerNode) { $node->appendChild($new_node); } else { $this->raiseParseFailure( self::PARSE_UNEXPECTED_CHILD, pht( 'Found unexpected node "%s" inside node "%s".', $new_node->getAttribute('ics.type'), $node->getAttribute('ics.type'))); } $this->stack[] = $new_node; $this->node = $new_node; return $this; } private function newICSNode($type) { switch ($type) { case '': $node = new PhutilCalendarRootNode(); break; case 'VCALENDAR': $node = new PhutilCalendarDocumentNode(); break; case 'VEVENT': $node = new PhutilCalendarEventNode(); break; default: $node = new PhutilCalendarRawNode(); break; } $node->setAttribute('ics.type', $type); return $node; } private function endParsingNode($type) { $node = $this->getNode(); if ($node instanceof PhutilCalendarRootNode) { $this->raiseParseFailure( self::PARSE_EXTRA_END, pht( 'Found unexpected "END" without a "BEGIN".')); } $old_type = $node->getAttribute('ics.type'); if ($old_type != $type) { $this->raiseParseFailure( self::PARSE_MISMATCHED_SECTIONS, pht( 'Found mismatched "BEGIN" ("%s") and "END" ("%s") sections.', $old_type, $type)); } array_pop($this->stack); $this->node = last($this->stack); return $this; } private function parseICSProperty($line) { $matches = null; // Properties begin with an alphanumeric name with no escaping, followed // by either a ";" (to begin a list of parameters) or a ":" (to begin // the actual field body). $ok = preg_match('(^([A-Za-z0-9-]+)([;:])(.*)\z)', $line, $matches); if (!$ok) { $this->raiseParseFailure( self::PARSE_MALFORMED_PROPERTY, pht( 'Found malformed property in ICS document.')); } $name = $matches[1]; $body = $matches[3]; $has_parameters = ($matches[2] == ';'); $parameters = array(); if ($has_parameters) { // Parameters are a sensible name, a literal "=", a pile of magic, // and then maybe a comma and another parameter. while (true) { // We're going to get the first couple of parts first. $ok = preg_match('(^([^=]+)=)', $body, $matches); if (!$ok) { $this->raiseParseFailure( self::PARSE_MALFORMED_PARAMETER_NAME, pht( 'Found malformed property in ICS document: %s', $body)); } $param_name = $matches[1]; $body = substr($body, strlen($matches[0])); // Now we're going to match zero or more values. $param_values = array(); while (true) { // The value can either be a double-quoted string or an unquoted // string, with some characters forbidden. if (strlen($body) && $body[0] == '"') { $is_quoted = true; $ok = preg_match( '(^"([^\x00-\x08\x10-\x19"]*)")', $body, $matches); if (!$ok) { $this->raiseParseFailure( self::PARSE_MALFORMED_DOUBLE_QUOTE, pht( 'Found malformed double-quoted string in ICS document '. 'parameter value.')); } } else { $is_quoted = false; // It's impossible for this not to match since it can match // nothing, and it's valid for it to match nothing. preg_match('(^([^\x00-\x08\x10-\x19";:,]*))', $body, $matches); } // NOTE: RFC5545 says "Property parameter values that are not in // quoted-strings are case-insensitive." -- that is, the quoted and // unquoted representations are not equivalent. Thus, preserve the // original formatting in case we ever need to respect this. $param_values[] = array( 'value' => $this->unescapeParameterValue($matches[1]), 'quoted' => $is_quoted, ); $body = substr($body, strlen($matches[0])); if (!strlen($body)) { $this->raiseParseFailure( self::PARSE_MISSING_VALUE, pht( 'Expected ":" after parameters in ICS document property.')); } // If we have a comma now, we're going to read another value. Strip // it off and keep going. if ($body[0] == ',') { $body = substr($body, 1); continue; } // If we have a semicolon, we're going to read another parameter. if ($body[0] == ';') { break; } // If we have a colon, this is the last value and also the last // property. Break, then handle the colon below. if ($body[0] == ':') { break; } $short_body = id(new PhutilUTF8StringTruncator()) ->setMaximumGlyphs(32) ->truncateString($body); // We aren't expecting anything else. $this->raiseParseFailure( self::PARSE_UNEXPECTED_TEXT, pht( 'Found unexpected text ("%s") after reading parameter value.', $short_body)); } $parameters[] = array( 'name' => $param_name, 'values' => $param_values, ); if ($body[0] == ';') { $body = substr($body, 1); continue; } if ($body[0] == ':') { $body = substr($body, 1); break; } } } $value = $this->unescapeFieldValue($name, $parameters, $body); $node = $this->getNode(); $raw = $node->getAttribute('ics.properties', array()); $raw[] = array( 'name' => $name, 'parameters' => $parameters, 'value' => $value, ); $node->setAttribute('ics.properties', $raw); switch ($node->getAttribute('ics.type')) { case 'VEVENT': $this->didParseEventProperty($node, $name, $parameters, $value); break; } } private function unescapeParameterValue($data) { // The parameter grammar is adjusted by RFC6868 to permit escaping with // carets. Remove that escaping. // This escaping is a bit weird because it's trying to be backwards // compatible and the original spec didn't think about this and didn't // provide much room to fix things. $out = ''; $esc = false; foreach (phutil_utf8v($data) as $c) { if (!$esc) { if ($c != '^') { $out .= $c; } else { $esc = true; } } else { switch ($c) { case 'n': $out .= "\n"; break; case '^': $out .= '^'; break; case "'": // NOTE: This is " " being decoded into a // double quote! $out .= '"'; break; default: // NOTE: The caret is NOT an escape for any other characters. // This is a "MUST" requirement of RFC6868. $out .= '^'.$c; break; } } } // NOTE: Because caret on its own just means "caret" for backward // compatibility, we don't warn if we're still in escaped mode once we // reach the end of the string. return $out; } private function unescapeFieldValue($name, array $parameters, $data) { // NOTE: The encoding of the field value data is dependent on the field // name (which defines a default encoding) and the parameters (which may // include "VALUE", specifying a type of the data. $default_types = array( 'CALSCALE' => 'TEXT', 'METHOD' => 'TEXT', 'PRODID' => 'TEXT', 'VERSION' => 'TEXT', 'ATTACH' => 'URI', 'CATEGORIES' => 'TEXT', 'CLASS' => 'TEXT', 'COMMENT' => 'TEXT', 'DESCRIPTION' => 'TEXT', // TODO: The spec appears to contradict itself: it says that the value // type is FLOAT, but it also says that this property value is actually // two semicolon-separated values, which is not what FLOAT is defined as. 'GEO' => 'TEXT', 'LOCATION' => 'TEXT', 'PERCENT-COMPLETE' => 'INTEGER', 'PRIORITY' => 'INTEGER', 'RESOURCES' => 'TEXT', 'STATUS' => 'TEXT', 'SUMMARY' => 'TEXT', 'COMPLETED' => 'DATE-TIME', 'DTEND' => 'DATE-TIME', 'DUE' => 'DATE-TIME', 'DTSTART' => 'DATE-TIME', 'DURATION' => 'DURATION', 'FREEBUSY' => 'PERIOD', 'TRANSP' => 'TEXT', 'TZID' => 'TEXT', 'TZNAME' => 'TEXT', 'TZOFFSETFROM' => 'UTC-OFFSET', 'TZOFFSETTO' => 'UTC-OFFSET', 'TZURL' => 'URI', 'ATTENDEE' => 'CAL-ADDRESS', 'CONTACT' => 'TEXT', 'ORGANIZER' => 'CAL-ADDRESS', 'RECURRENCE-ID' => 'DATE-TIME', 'RELATED-TO' => 'TEXT', 'URL' => 'URI', 'UID' => 'TEXT', 'EXDATE' => 'DATE-TIME', 'RDATE' => 'DATE-TIME', 'RRULE' => 'RECUR', 'ACTION' => 'TEXT', 'REPEAT' => 'INTEGER', 'TRIGGER' => 'DURATION', 'CREATED' => 'DATE-TIME', 'DTSTAMP' => 'DATE-TIME', 'LAST-MODIFIED' => 'DATE-TIME', 'SEQUENCE' => 'INTEGER', 'REQUEST-STATUS' => 'TEXT', ); $value_type = idx($default_types, $name, 'TEXT'); foreach ($parameters as $parameter) { if ($parameter['name'] == 'VALUE') { $value_type = idx(head($parameter['values']), 'value'); } } switch ($value_type) { case 'BINARY': $result = base64_decode($data, true); if ($result === false) { $this->raiseParseFailure( self::PARSE_BAD_BASE64, pht( 'Unable to decode base64 data: %s', $data)); } break; case 'BOOLEAN': $map = array( 'true' => true, 'false' => false, ); $result = phutil_utf8_strtolower($data); if (!isset($map[$result])) { $this->raiseParseFailure( self::PARSE_BAD_BOOLEAN, pht( 'Unexpected BOOLEAN value "%s".', $data)); } $result = $map[$result]; break; case 'CAL-ADDRESS': $result = $data; break; case 'DATE': // This is a comma-separated list of "YYYYMMDD" values. $result = explode(',', $data); break; case 'DATE-TIME': if (!strlen($data)) { $result = array(); } else { $result = explode(',', $data); } break; case 'DURATION': if (!strlen($data)) { $result = array(); } else { $result = explode(',', $data); } break; case 'FLOAT': $result = explode(',', $data); foreach ($result as $k => $v) { $result[$k] = (float)$v; } break; case 'INTEGER': $result = explode(',', $data); foreach ($result as $k => $v) { $result[$k] = (int)$v; } break; case 'PERIOD': $result = explode(',', $data); break; case 'RECUR': $result = $data; break; case 'TEXT': $result = $this->unescapeTextValue($data); break; case 'TIME': $result = explode(',', $data); break; case 'URI': $result = $data; break; case 'UTC-OFFSET': $result = $data; break; default: // RFC5545 says we MUST preserve the data for any types we don't // recognize. $result = $data; break; } return array( 'type' => $value_type, 'value' => $result, 'raw' => $data, ); } private function unescapeTextValue($data) { $result = array(); $buf = ''; $esc = false; foreach (phutil_utf8v($data) as $c) { if (!$esc) { if ($c == '\\') { $esc = true; } else if ($c == ',') { $result[] = $buf; $buf = ''; } else { $buf .= $c; } } else { switch ($c) { case 'n': case 'N': $buf .= "\n"; break; default: $buf .= $c; break; } $esc = false; } } if ($esc) { $this->raiseParseFailure( self::PARSE_UNESCAPED_BACKSLASH, pht( 'ICS document contains TEXT value ending with unescaped '. 'backslash.')); } $result[] = $buf; return $result; } private function raiseParseFailure($code, $message) { if ($this->lines && isset($this->lines[$this->cursor])) { $message = pht( "ICS Parse Error near line %s:\n\n>>> %s\n\n%s", $this->cursor + 1, $this->lines[$this->cursor], $message); } else { $message = pht( 'ICS Parse Error: %s', $message); } throw id(new PhutilICSParserException($message)) ->setParserFailureCode($code); } private function raiseWarning($code, $message) { $this->warnings[] = array( 'code' => $code, 'line' => $this->cursor, 'text' => $this->lines[$this->cursor], 'message' => $message, ); return $this; } public function getWarnings() { return $this->warnings; } private function didParseEventProperty( PhutilCalendarEventNode $node, $name, array $parameters, array $value) { switch ($name) { case 'UID': $text = $this->newTextFromProperty($parameters, $value); $node->setUID($text); break; case 'CREATED': $datetime = $this->newDateTimeFromProperty($parameters, $value); $node->setCreatedDateTime($datetime); break; case 'DTSTAMP': $datetime = $this->newDateTimeFromProperty($parameters, $value); $node->setModifiedDateTime($datetime); break; case 'SUMMARY': $text = $this->newTextFromProperty($parameters, $value); $node->setName($text); break; case 'DESCRIPTION': $text = $this->newTextFromProperty($parameters, $value); $node->setDescription($text); break; case 'DTSTART': $datetime = $this->newDateTimeFromProperty($parameters, $value); $node->setStartDateTime($datetime); break; case 'DTEND': $datetime = $this->newDateTimeFromProperty($parameters, $value); $node->setEndDateTime($datetime); break; case 'DURATION': $duration = $this->newDurationFromProperty($parameters, $value); $node->setDuration($duration); break; case 'RRULE': $rrule = $this->newRecurrenceRuleFromProperty($parameters, $value); $node->setRecurrenceRule($rrule); break; case 'RECURRENCE-ID': $text = $this->newTextFromProperty($parameters, $value); $node->setRecurrenceID($text); break; case 'ATTENDEE': $attendee = $this->newAttendeeFromProperty($parameters, $value); $node->addAttendee($attendee); break; } } private function newTextFromProperty(array $parameters, array $value) { $value = $value['value']; return implode("\n\n", $value); } private function newAttendeeFromProperty(array $parameters, array $value) { $uri = $value['value']; switch (idx($parameters, 'PARTSTAT')) { case 'ACCEPTED': $status = PhutilCalendarUserNode::STATUS_ACCEPTED; break; case 'DECLINED': $status = PhutilCalendarUserNode::STATUS_DECLINED; break; case 'NEEDS-ACTION': default: $status = PhutilCalendarUserNode::STATUS_INVITED; break; } $name = $this->getScalarParameterValue($parameters, 'CN'); return id(new PhutilCalendarUserNode()) ->setURI($uri) ->setName($name) ->setStatus($status); } private function newDateTimeFromProperty(array $parameters, array $value) { $value = $value['value']; if (!$value) { $this->raiseParseFailure( self::PARSE_EMPTY_DATETIME, pht( 'Expected DATE-TIME to have exactly one value, found none.')); } if (count($value) > 1) { $this->raiseParseFailure( self::PARSE_MANY_DATETIME, pht( 'Expected DATE-TIME to have exactly one value, found more than '. 'one.')); } $value = head($value); $tzid = $this->getScalarParameterValue($parameters, 'TZID'); if (preg_match('/Z\z/', $value)) { if ($tzid) { $this->raiseWarning( self::WARN_TZID_UTC, pht( 'DATE-TIME "%s" uses "Z" to specify UTC, but also has a TZID '. 'parameter with value "%s". This violates RFC5545. The TZID '. 'will be ignored, and the value will be interpreted as UTC.', $value, $tzid)); } $tzid = 'UTC'; } else if ($tzid !== null) { $tzid = $this->guessTimezone($tzid); } try { $datetime = PhutilCalendarAbsoluteDateTime::newFromISO8601( $value, $tzid); } catch (Exception $ex) { $this->raiseParseFailure( self::PARSE_BAD_DATETIME, pht( 'Error parsing DATE-TIME: %s', $ex->getMessage())); } return $datetime; } private function newDurationFromProperty(array $parameters, array $value) { $value = $value['value']; if (!$value) { $this->raiseParseFailure( self::PARSE_EMPTY_DURATION, pht( 'Expected DURATION to have exactly one value, found none.')); } if (count($value) > 1) { $this->raiseParseFailure( self::PARSE_MANY_DURATION, pht( 'Expected DURATION to have exactly one value, found more than '. 'one.')); } $value = head($value); try { $duration = PhutilCalendarDuration::newFromISO8601($value); } catch (Exception $ex) { $this->raiseParseFailure( self::PARSE_BAD_DURATION, pht( 'Invalid DURATION: %s', $ex->getMessage())); } return $duration; } private function newRecurrenceRuleFromProperty(array $parameters, $value) { return PhutilCalendarRecurrenceRule::newFromRRULE($value['value']); } private function getScalarParameterValue( array $parameters, $name, $default = null) { $match = null; foreach ($parameters as $parameter) { if ($parameter['name'] == $name) { $match = $parameter; } } if ($match === null) { return $default; } $value = $match['values']; if (!$value) { // Parameter is specified, but with no value, like "KEY=". Just return // the default, as though the parameter was not specified. return $default; } if (count($value) > 1) { $this->raiseParseFailure( self::PARSE_MULTIPLE_PARAMETERS, pht( 'Expected parameter "%s" to have at most one value, but found '. 'more than one.', $name)); } return idx(head($value), 'value'); } private function guessTimezone($tzid) { $map = DateTimeZone::listIdentifiers(); $map = array_fuse($map); if (isset($map[$tzid])) { // This is a real timezone we recognize, so just use it as provided. return $tzid; } // These are alternate names for timezones. - $aliases = array( - 'Etc/GMT' => 'UTC', - ); + static $aliases; + + if ($aliases === null) { + $aliases = array( + 'Etc/GMT' => 'UTC', + ); + + // Load the map of Windows timezones. + $root_path = dirname(phutil_get_library_root('phutil')); + $windows_path = $root_path.'/resources/timezones/windows_timezones.json'; + $windows_data = Filesystem::readFile($windows_path); + $windows_zones = phutil_json_decode($windows_data); + + $aliases = $aliases + $windows_zones; + } if (isset($aliases[$tzid])) { return $aliases[$tzid]; } // Look for something that looks like "UTC+3" or "GMT -05.00". If we find - // anything + // anything, pick a timezone with that offset. $offset_pattern = '/'. '(?:UTC|GMT)'. '\s*'. '(?P[+-])'. '\s*'. '(?P\d+)'. '(?:'. '[:.](?P\d+)'. ')?'. '/i'; $matches = null; if (preg_match($offset_pattern, $tzid, $matches)) { $hours = (int)$matches['h']; $minutes = (int)idx($matches, 'm'); $offset = ($hours * 60 * 60) + ($minutes * 60); if (idx($matches, 'sign') == '-') { $offset = -$offset; } // NOTE: We could possibly do better than this, by using the event start // time to guess a timezone. However, that won't work for recurring // events and would require us to do this work after finishing initial // parsing. Since these unusual offset-based timezones appear to be rare, // the benefit may not be worth the complexity. $now = new DateTime('@'.time()); foreach ($map as $identifier) { $zone = new DateTimeZone($identifier); if ($zone->getOffset($now) == $offset) { $this->raiseWarning( self::WARN_TZID_GUESS, pht( 'TZID "%s" is unknown, guessing "%s" based on pattern "%s".', $tzid, $identifier, $matches[0])); return $identifier; } } } $this->raiseWarning( self::WARN_TZID_IGNORED, pht( 'TZID "%s" is unknown, using UTC instead.', $tzid)); return 'UTC'; } } diff --git a/src/utils/PhutilEditDistanceMatrix.php b/src/utils/PhutilEditDistanceMatrix.php index 1dd21f7..6ad5ab0 100644 --- a/src/utils/PhutilEditDistanceMatrix.php +++ b/src/utils/PhutilEditDistanceMatrix.php @@ -1,555 +1,562 @@ setSequences(str_split('ran'), str_split('rat')); * * $cost = $matrix->getEditDistance(); * * Edit distance computation is slow and requires a large amount of memory; * both are roughly O(N * M) in the length of the strings. * * You can customize the cost of insertions, deletions and replacements. By * default, each has a cost of 1. * * $matrix->setReplaceCost(1); * * By default, transpositions are not evaluated (i.e., the algorithm is * Levenshtein). You can cause transpositions to be evaluated by setting a * transpose cost (which will change the algorithm to Damerau-Levenshtein): * * $matrix->setTransposeCost(1); * * You can also provide a cost to alter the type of operation being applied. * Many strings have several equivalently expensive edit sequences, but one * some sequences are more readable to humans than others. Providing a small * cost to alter operation type tends to smooth out the sequence and produce * long runs of a single operation, which are generally more readable. For * example, these strings: * * (x) * ((x)) * * ...have edit strings "issis" and "isssi", which describe edit operations with * the same cost, but the latter string is more comprehensible to human viewers. * * If you set an alter cost, you must call @{method:setComputeString} to enable * type computation. The alter cost should generally be smaller than `c / N`, * where `c` is the smallest operational cost and `N` is the length of the * longest string. For example, if you are using the default costs (insert = 1, * delete = 1, replace = 1) and computing edit distances for strings of fewer * than 1,000 characters, you might set the alter cost to 0.001. */ final class PhutilEditDistanceMatrix extends Phobject { private $insertCost = 1; private $deleteCost = 1; private $replaceCost = 1; private $transposeCost = null; private $alterCost = 0; private $maximumLength; private $computeString; private $applySmoothing = self::SMOOTHING_NONE; + private $reachedMaximumLength; private $x; private $y; private $prefix; private $suffix; private $distanceMatrix = null; private $typeMatrix = null; const SMOOTHING_NONE = 'none'; const SMOOTHING_INTERNAL = 'internal'; const SMOOTHING_FULL = 'full'; public function setMaximumLength($maximum_length) { $this->maximumLength = $maximum_length; return $this; } public function getMaximumLength() { return coalesce($this->maximumLength, $this->getInfinity()); } + public function didReachMaximumLength() { + return $this->reachedMaximumLength; + } + public function setComputeString($compute_string) { $this->computeString = $compute_string; return $this; } public function getComputeString() { return $this->computeString; } public function setTransposeCost($transpose_cost) { $this->transposeCost = $transpose_cost; return $this; } public function getTransposeCost() { return $this->transposeCost; } public function setReplaceCost($replace_cost) { $this->replaceCost = $replace_cost; return $this; } public function getReplaceCost() { return $this->replaceCost; } public function setDeleteCost($delete_cost) { $this->deleteCost = $delete_cost; return $this; } public function getDeleteCost() { return $this->deleteCost; } public function setInsertCost($insert_cost) { $this->insertCost = $insert_cost; return $this; } public function getInsertCost() { return $this->insertCost; } public function setAlterCost($alter_cost) { $this->alterCost = $alter_cost; return $this; } public function getAlterCost() { return $this->alterCost; } public function setApplySmoothing($apply_smoothing) { $this->applySmoothing = $apply_smoothing; return $this; } public function getApplySmoothing() { return $this->applySmoothing; } public function setSequences(array $x, array $y) { // NOTE: We strip common prefixes and suffixes from the inputs because // the runtime of the edit distance algorithm is large and it is common // to diff similar strings. $xl = count($x); $yl = count($y); $l = min($xl, $yl); $prefix_l = 0; $suffix_l = 0; for ($ii = 0; $ii < $l; $ii++) { if ($x[$ii] !== $y[$ii]) { break; } $prefix_l++; } for ($ii = 1; $ii <= ($l - $prefix_l); $ii++) { if ($x[$xl - $ii] !== $y[$yl - $ii]) { break; } $suffix_l++; } $this->prefix = array_slice($x, 0, $prefix_l); $this->suffix = array_slice($x, $xl - $suffix_l); $this->x = array_slice($x, $prefix_l, $xl - ($suffix_l + $prefix_l)); $this->y = array_slice($y, $prefix_l, $yl - ($suffix_l + $prefix_l)); $this->distanceMatrix = null; return $this; } private function requireSequences() { if ($this->x === null) { throw new PhutilInvalidStateException('setSequences'); } } public function getEditDistance() { $this->requireSequences(); $x = $this->x; $y = $this->y; if (!$x) { return $this->insertCost * count($y); } if (!$y) { return $this->deleteCost * count($x); } $max = $this->getMaximumLength(); if (count($x) > $max || count($y) > $max) { + $this->reachedMaximumLength = true; return ($this->insertCost * count($y)) + ($this->deleteCost * count($x)); } if ($x === $y) { return 0; } $matrix = $this->getDistanceMatrix(); return $matrix[count($x)][count($y)]; } /** * Return a string representing the edits between the sequences. The string * has these characters: * * - s (same): Same character in both strings. * - i (insert): Character inserted. * - d (delete): Character deleted. * - x (replace): Character replaced. * - t (transpose): Character transposed. */ public function getEditString() { $this->requireSequences(); $x = $this->x; $y = $this->y; if (!$x && !$y) { return $this->padEditString(''); } if (!$x) { return $this->padEditString(str_repeat('i', count($y))); } if (!$y) { return $this->padEditString(str_repeat('d', count($x))); } if ($x === $y) { return $this->padEditString(str_repeat('s', count($x))); } $max = $this->getMaximumLength(); if (count($x) > $max || count($y) > $max) { + $this->reachedMaximumLength = true; return $this->padEditString( str_repeat('d', count($x)). str_repeat('i', count($y))); } $matrix = $this->getTypeMatrix(); $xx = count($x); $yy = count($y); $transposes = array(); $str = ''; while (true) { $type = $matrix[$xx][$yy]; if (is_array($type)) { $chr = 't'; $transposes[$type[0]][$type[1]] = true; $type = $type[2]; } else { $chr = $type; } if (isset($transposes[$xx][$yy])) { $chr = 't'; } if ($type == 's' || $type == 'x') { $xx -= 1; $yy -= 1; } else if ($type == 'i') { $yy -= 1; } else if ($type == 'd') { $xx -= 1; } else { throw new Exception(pht("Unknown type '%s' in type matrix.", $type)); } $str .= $chr; if ($xx <= 0 && $yy <= 0) { break; } } $str = strrev($str); // For full smoothing, we pad the edit string before smoothing it, so // ranges of similar characters at the beginning or end of the string can // also be smoothed. // For internal smoothing, we only smooth ranges within the change itself. $smoothing = $this->getApplySmoothing(); switch ($smoothing) { case self::SMOOTHING_FULL: $str = $this->padEditString($str); $str = $this->applySmoothing($str, true); break; case self::SMOOTHING_INTERNAL: $str = $this->applySmoothing($str, false); $str = $this->padEditString($str); break; case self::SMOOTHING_NONE: $str = $this->padEditString($str); break; default: throw new Exception( pht( 'Unknown smoothing type "%s".', $smoothing)); } return $str; } private function padEditString($str) { if ($this->prefix) { $str = str_repeat('s', count($this->prefix)).$str; } if ($this->suffix) { $str = $str.str_repeat('s', count($this->suffix)); } return $str; } private function getTypeMatrix() { if (!$this->computeString) { throw new PhutilInvalidStateException('setComputeString'); } if ($this->typeMatrix === null) { $this->computeMatrix($this->x, $this->y); } return $this->typeMatrix; } private function getDistanceMatrix() { if ($this->distanceMatrix === null) { $this->computeMatrix($this->x, $this->y); } return $this->distanceMatrix; } private function computeMatrix(array $x, array $y) { $xl = count($x); $yl = count($y); $m = array(); $infinity = $this->getInfinity(); $use_damerau = ($this->transposeCost !== null); if ($use_damerau) { // Initialize the default cost of a transpose. $m[-1][-1] = $infinity; // Initialize the character position dictionary for Damerau. $d = array(); foreach ($x as $c) { $d[$c] = -1; } foreach ($y as $c) { $d[$c] = -1; } // Initialize the transpose costs for Damerau. for ($xx = 0; $xx <= $xl; $xx++) { $m[$xx][-1] = $infinity; } for ($yy = 0; $yy <= $yl; $yy++) { $m[-1][$yy] = $infinity; } } // Initialize the top row of the matrix. for ($xx = 0; $xx <= $xl; $xx++) { $m[$xx][0] = $xx * $this->deleteCost; } // Initialize the left column of the matrix. $cost = 0; for ($yy = 0; $yy <= $yl; $yy++) { $m[0][$yy] = $yy * $this->insertCost; } $use_types = ($this->computeString); if ($use_types) { $t = array(); for ($xx = 0; $xx <= $xl; $xx++) { $t[$xx][0] = 'd'; } for ($yy = 0; $yy <= $yl; $yy++) { $t[0][$yy] = 'i'; } $t[0][0] = 's'; } $alt_cost = $this->getAlterCost(); if ($alt_cost && !$use_types) { throw new Exception( pht( 'If you provide an alter cost with %s, you must enable '. 'type computation with %s.', 'setAlterCost()', 'setComputeStrings()')); } // Build the edit distance matrix. for ($xx = 1; $xx <= $xl; $xx++) { $last_dy = -1; for ($yy = 1; $yy <= $yl; $yy++) { if ($use_damerau) { $dx = $d[$y[$yy - 1]]; $dy = $last_dy; } if ($x[$xx - 1] === $y[$yy - 1]) { $rep_cost = $m[$xx - 1][$yy - 1] + 0; $rep_type = 's'; } else { $rep_cost = $m[$xx - 1][$yy - 1] + $this->replaceCost; $rep_type = 'x'; } $del_cost = $m[$xx - 1][$yy ] + $this->deleteCost; $ins_cost = $m[$xx ][$yy - 1] + $this->insertCost; if ($alt_cost) { $del_char = $t[$xx - 1][$yy ]; $ins_char = $t[$xx ][$yy - 1]; $rep_char = $t[$xx - 1][$yy - 1]; if ($del_char !== 'd') { $del_cost += $alt_cost; } if ($ins_char !== 'i') { $ins_cost += $alt_cost; } if ($rep_char !== $rep_type) { $rep_cost += $alt_cost; } } if ($rep_cost <= $del_cost && $rep_cost <= $ins_cost) { $cost = $rep_cost; $type = $rep_type; if ($rep_type === 's') { $last_dy = $yy - 1; } } else if ($ins_cost <= $del_cost) { $cost = $ins_cost; $type = 'i'; } else { $cost = $del_cost; $type = 'd'; } if ($use_damerau) { $trn_count = (($xx - $dx - 2) + ($yy - $dy - 2) + 1); $trn_cost = $m[$dx][$dy] + ($trn_count * $this->transposeCost); if ($trn_cost < $cost) { $cost = $trn_cost; $type = array($dx + 1, $dy + 1, $type); } } $m[$xx][$yy] = $cost; if ($use_types) { $t[$xx][$yy] = $type; } } if ($use_damerau) { $d[$x[$xx - 1]] = ($xx - 1); } } $this->distanceMatrix = $m; if ($use_types) { $this->typeMatrix = $t; } } private function getInfinity() { return INF; } private function printMatrix(array $m) { $x = $this->x; $y = $this->y; $p = '% 8s '; printf($p, ' '); foreach (head($m) as $yk => $yv) { printf($p, idx($y, $yk - 1, '-')); } echo "\n"; foreach ($m as $xk => $xv) { printf($p, idx($x, $xk - 1, '-')); foreach ($xv as $yk => $yv) { printf($p, ($yv == $this->getInfinity() ? 'inf' : $yv)); } echo "\n"; } } private function printTypeMatrix(array $t) { $x = $this->x; $y = $this->y; $p = '% 8s '; printf($p, ' '); foreach (head($t) as $yk => $yv) { printf($p, idx($y, $yk - 1, '-')); } echo "\n"; foreach ($t as $xk => $xv) { printf($p, idx($x, $xk - 1, '-')); foreach ($xv as $yk => $yv) { printf($p, ($yv == $this->getInfinity() ? 'inf' : $yv)); } echo "\n"; } } private function applySmoothing($str, $full) { if ($full) { $prefix = '(^|[xdi])'; $suffix = '([xdi]|\z)'; } else { $prefix = '([xdi])'; $suffix = '([xdi])'; } // Smooth the string out, by replacing short runs of similar characters // with 'x' operations. This makes the result more readable to humans, // since there are fewer choppy runs of short added and removed substrings. do { $original = $str; $str = preg_replace('/'.$prefix.'(s{3})'.$suffix.'/', '$1xxx$3', $str); $str = preg_replace('/'.$prefix.'(s{2})'.$suffix.'/', '$1xx$3', $str); $str = preg_replace('/'.$prefix.'(s{1})'.$suffix.'/', '$1x$3', $str); } while ($str != $original); return $str; } } diff --git a/src/utils/PhutilProseDiff.php b/src/utils/PhutilProseDiff.php index 5507e52..df34d64 100644 --- a/src/utils/PhutilProseDiff.php +++ b/src/utils/PhutilProseDiff.php @@ -1,286 +1,292 @@ parts[] = array( 'type' => $type, 'text' => $text, ); return $this; } public function getParts() { return $this->parts; } /** * Get diff parts, but replace large blocks of unchanged text with "." * parts representing missing context. */ public function getSummaryParts() { $parts = $this->getParts(); $head_key = head_key($parts); $last_key = last_key($parts); $results = array(); foreach ($parts as $key => $part) { $is_head = ($key == $head_key); $is_last = ($key == $last_key); switch ($part['type']) { case '=': $pieces = $this->splitTextForSummary($part['text']); if ($is_head || $is_last) { $need = 2; } else { $need = 3; } // We don't have enough pieces to omit anything, so just continue. if (count($pieces) < $need) { $results[] = $part; break; } if (!$is_head) { $results[] = array( 'type' => '=', 'text' => head($pieces), ); } $results[] = array( 'type' => '.', 'text' => null, ); if (!$is_last) { $results[] = array( 'type' => '=', 'text' => last($pieces), ); } break; default: $results[] = $part; break; } } return $results; } public function reorderParts() { // Reorder sequences of removed and added sections to put all the "-" // parts together first, then all the "+" parts together. This produces // a more human-readable result than intermingling them. $o_run = array(); $n_run = array(); $result = array(); foreach ($this->parts as $part) { $type = $part['type']; switch ($type) { case '-': $o_run[] = $part; break; case '+': $n_run[] = $part; break; default: if ($o_run || $n_run) { foreach ($this->combineRuns($o_run, $n_run) as $merged_part) { $result[] = $merged_part; } $o_run = array(); $n_run = array(); } $result[] = $part; break; } } if ($o_run || $n_run) { foreach ($this->combineRuns($o_run, $n_run) as $part) { $result[] = $part; } } // Now, combine consecuitive runs of the same type of change (like a // series of "-" parts) into a single run. $combined = array(); $last = null; $last_text = null; foreach ($result as $part) { $type = $part['type']; if ($last !== $type) { if ($last !== null) { $combined[] = array( 'type' => $last, 'text' => $last_text, ); } $last_text = null; $last = $type; } $last_text .= $part['text']; } if ($last_text !== null) { $combined[] = array( 'type' => $last, 'text' => $last_text, ); } $this->parts = $combined; return $this; } private function combineRuns($o_run, $n_run) { $o_merge = $this->mergeParts($o_run); $n_merge = $this->mergeParts($n_run); // When removed and added blocks share a prefix or suffix, we sometimes // want to count it as unchanged (for example, if it is whitespace) but // sometimes want to count it as changed (for example, if it is a word // suffix like "ing"). Find common prefixes and suffixes of these layout // characters and emit them as "=" (unchanged) blocks. $layout_characters = array( ' ' => true, "\n" => true, '.' => true, '!' => true, ',' => true, '?' => true, + ']' => true, + '[' => true, + '(' => true, + ')' => true, + '<' => true, + '>' => true, ); $o_text = $o_merge['text']; $n_text = $n_merge['text']; $o_len = strlen($o_text); $n_len = strlen($n_text); $min_len = min($o_len, $n_len); $prefix_len = 0; for ($pos = 0; $pos < $min_len; $pos++) { $o = $o_text[$pos]; $n = $n_text[$pos]; if ($o !== $n) { break; } if (empty($layout_characters[$o])) { break; } $prefix_len++; } $suffix_len = 0; for ($pos = 0; $pos < ($min_len - $prefix_len); $pos++) { $o = $o_text[$o_len - ($pos + 1)]; $n = $n_text[$n_len - ($pos + 1)]; if ($o !== $n) { break; } if (empty($layout_characters[$o])) { break; } $suffix_len++; } $results = array(); if ($prefix_len) { $results[] = array( 'type' => '=', 'text' => substr($o_text, 0, $prefix_len), ); } if ($prefix_len < $o_len) { $results[] = array( 'type' => '-', 'text' => substr( $o_text, $prefix_len, $o_len - $prefix_len - $suffix_len), ); } if ($prefix_len < $n_len) { $results[] = array( 'type' => '+', 'text' => substr( $n_text, $prefix_len, $n_len - $prefix_len - $suffix_len), ); } if ($suffix_len) { $results[] = array( 'type' => '=', 'text' => substr($o_text, -$suffix_len), ); } return $results; } private function mergeParts(array $parts) { $text = ''; $type = null; foreach ($parts as $part) { $part_type = $part['type']; if ($type === null) { $type = $part_type; } if ($type !== $part_type) { throw new Exception(pht('Can not merge parts of dissimilar types!')); } $text .= $part['text']; } return array( 'type' => $type, 'text' => $text, ); } private function splitTextForSummary($text) { $matches = null; $ok = preg_match('/^(\n*[^\n]+)\n/', $text, $matches); if (!$ok) { return array($text); } $head = $matches[1]; $text = substr($text, strlen($head)); $ok = preg_match('/\n([^\n]+\n*)\z/', $text, $matches); if (!$ok) { return array($text); } $last = $matches[1]; $text = substr($text, 0, -strlen($last)); if (!strlen(trim($text))) { return array($head, $last); } else { return array($head, $text, $last); } } } diff --git a/src/utils/PhutilProseDifferenceEngine.php b/src/utils/PhutilProseDifferenceEngine.php index 6000f1a..5deb8e6 100644 --- a/src/utils/PhutilProseDifferenceEngine.php +++ b/src/utils/PhutilProseDifferenceEngine.php @@ -1,199 +1,209 @@ buildDiff($u, $v, 1); + return $this->buildDiff($u, $v, 0); } private function buildDiff($u, $v, $level) { $u_parts = $this->splitCorpus($u, $level); $v_parts = $this->splitCorpus($v, $level); $matrix = id(new PhutilEditDistanceMatrix()) ->setMaximumLength(128) ->setSequences($u_parts, $v_parts) ->setComputeString(true); // For word-level and character-level changes, smooth the output string // to reduce the choppiness of the diff. if ($level > 1) { $matrix->setApplySmoothing(PhutilEditDistanceMatrix::SMOOTHING_FULL); } $u_pos = 0; $v_pos = 0; $edits = $matrix->getEditString(); $edits_length = strlen($edits); $diff = new PhutilProseDiff(); for ($ii = 0; $ii < $edits_length; $ii++) { $c = $edits[$ii]; if ($c == 's') { $diff->addPart('=', $u_parts[$u_pos]); $u_pos++; $v_pos++; } else if ($c == 'd') { $diff->addPart('-', $u_parts[$u_pos]); $u_pos++; } else if ($c == 'i') { $diff->addPart('+', $v_parts[$v_pos]); $v_pos++; } else if ($c == 'x') { $diff->addPart('-', $u_parts[$u_pos]); $diff->addPart('+', $v_parts[$v_pos]); $u_pos++; $v_pos++; } else { throw new Exception( pht( 'Unexpected character ("%s") in edit string.', $c)); } } $diff->reorderParts(); // If we just built a character-level diff, we're all done and do not // need to go any deeper. if ($level == 3) { return $diff; } $blocks = array(); $block = null; foreach ($diff->getParts() as $part) { $type = $part['type']; $text = $part['text']; switch ($type) { case '=': if ($block) { $blocks[] = $block; $block = null; } $blocks[] = array( 'type' => $type, 'text' => $text, ); break; case '-': if (!$block) { $block = array( 'type' => '!', 'old' => '', 'new' => '', ); } $block['old'] .= $text; break; case '+': if (!$block) { $block = array( 'type' => '!', 'old' => '', 'new' => '', ); } $block['new'] .= $text; break; } } if ($block) { $blocks[] = $block; } $result = new PhutilProseDiff(); foreach ($blocks as $block) { $type = $block['type']; if ($type == '=') { $result->addPart('=', $block['text']); } else { $old = $block['old']; $new = $block['new']; if (!strlen($old) && !strlen($new)) { // Nothing to do. } else if (!strlen($old)) { $result->addPart('+', $new); } else if (!strlen($new)) { $result->addPart('-', $old); } else { - $subdiff = $this->buildDiff( - $old, - $new, - $level + 1); - - foreach ($subdiff->getParts() as $part) { - $result->addPart($part['type'], $part['text']); + if ($matrix->didReachMaximumLength()) { + // If this text was too big to diff, don't try to subdivide it. + $result->addPart('-', $old); + $result->addPart('+', $new); + } else { + $subdiff = $this->buildDiff( + $old, + $new, + $level + 1); + + foreach ($subdiff->getParts() as $part) { + $result->addPart($part['type'], $part['text']); + } } } } } $result->reorderParts(); return $result; } private function splitCorpus($corpus, $level) { switch ($level) { + case 0: + // Level 0: Split into paragraphs. + $expr = '/([\n]+)/'; + break; case 1: // Level 1: Split into sentences. $expr = '/([\n,!;?\.]+)/'; break; case 2: // Level 2: Split into words. $expr = '/(\s+)/'; break; case 3: // Level 3: Split into characters. return phutil_utf8v_combined($corpus); } $pieces = preg_split($expr, $corpus, -1, PREG_SPLIT_DELIM_CAPTURE); return $this->stitchPieces($pieces, $level); } private function stitchPieces(array $pieces, $level) { $results = array(); $count = count($pieces); for ($ii = 0; $ii < $count; $ii += 2) { $result = $pieces[$ii]; if ($ii + 1 < $count) { $result .= $pieces[$ii + 1]; } - if ($level == 1) { + if ($level < 2) { // Split pieces into separate text and whitespace sections: make one // piece out of all the whitespace at the beginning, one piece out of // all the actual text in the middle, and one piece out of all the // whitespace at the end. $matches = null; preg_match('/^(\s*)(.*?)(\s*)\z/', $result, $matches); if (strlen($matches[1])) { $results[] = $matches[1]; } if (strlen($matches[2])) { $results[] = $matches[2]; } if (strlen($matches[3])) { $results[] = $matches[3]; } } else { $results[] = $result; } } // If the input ended with a delimiter, we can get an empty final piece. // Just discard it. if (last($results) == '') { array_pop($results); } return $results; } } diff --git a/src/utils/__tests__/PhutilProseDiffTestCase.php b/src/utils/__tests__/PhutilProseDiffTestCase.php index 8d40484..bf0375e 100644 --- a/src/utils/__tests__/PhutilProseDiffTestCase.php +++ b/src/utils/__tests__/PhutilProseDiffTestCase.php @@ -1,204 +1,245 @@ assertProseParts( '', '', array(), pht('Empty')); $this->assertProseParts( "xxx\nyyy", "xxx\nzzz\nyyy", array( "= xxx\n", "+ zzz\n", '= yyy', ), pht('Add Paragraph')); $this->assertProseParts( "xxx\nzzz\nyyy", "xxx\nyyy", array( "= xxx\n", "- zzz\n", '= yyy', ), pht('Remove Paragraph')); // Without smoothing, the alogorithm identifies that "shark" and "cat" // both contain the letter "a" and tries to express this as a very // fine-grained edit which replaces "sh" with "c" and then "rk" with "t". // This is technically correct, but it is much easier for human viewers to // parse if we smooth this into a single removal and a single addition. $this->assertProseParts( 'They say the shark has nine lives.', 'They say the cat has nine lives.', array( '= They say the ', '- shark', '+ cat', '= has nine lives.', ), pht('"Shark/cat" word edit smoothenss.')); $this->assertProseParts( 'Rising quickly, she says', 'Rising quickly, she remarks:', array( '= Rising quickly, she ', '- says', '+ remarks:', ), pht('"Says/remarks" word edit smoothenss.')); $this->assertProseParts( 'See screenshots', 'Viewed video files', array( '- See screenshots', '+ Viewed video files', ), pht('Complete paragraph rewrite.')); $this->assertProseParts( 'xaaax', 'xbbbx', array( '- xaaax', '+ xbbbx', ), pht('Whole word rewrite with common prefix and suffix.')); $this->assertProseParts( ' aaa ', ' bbb ', array( '= ', '- aaa', '+ bbb', '= ', ), pht('Whole word rewrite with whitespace prefix and suffix.')); $this->assertSummaryProseParts( "a\nb\nc\nd\ne\nf\ng\nh\n", "a\nb\nc\nd\nX\nf\ng\nh\n", array( '.', "= d\n", '- e', '+ X', "= \nf", '.', ), pht('Summary diff with middle change.')); $this->assertSummaryProseParts( "a\nb\nc\nd\ne\nf\ng\nh\n", "X\nb\nc\nd\ne\nf\ng\nh\n", array( '- a', '+ X', "= \nb", '.', ), pht('Summary diff with head change.')); $this->assertSummaryProseParts( "a\nb\nc\nd\ne\nf\ng\nh\n", "a\nb\nc\nd\ne\nf\ng\nX\n", array( '.', "= g\n", '- h', '+ X', "= \n", ), pht('Summary diff with last change.')); $this->assertProseParts( 'aaa aaa aaa aaa, bbb bbb bbb bbb.', "aaa aaa aaa aaa, bbb bbb bbb bbb.\n\n- ccc ccc ccc", array( '= aaa aaa aaa aaa, bbb bbb bbb bbb.', "+ \n\n- ccc ccc ccc", ), pht('Diff with new trailing content.')); $this->assertProseParts( 'aaa aaa aaa aaa, bbb bbb bbb bbb.', 'aaa aaa aaa aaa bbb bbb bbb bbb.', array( '= aaa aaa aaa aaa', '- ,', '= bbb bbb bbb bbb.', ), pht('Diff with a removed comma.')); $this->assertProseParts( 'aaa aaa aaa aaa, bbb bbb bbb bbb.', "aaa aaa aaa aaa bbb bbb bbb bbb.\n\n- ccc ccc ccc!", array( '= aaa aaa aaa aaa', '- ,', '= bbb bbb bbb bbb.', "+ \n\n- ccc ccc ccc!", ), pht('Diff with a removed comma and new trailing content.')); + $this->assertProseParts( + '[ ] Walnuts', + '[X] Walnuts', + array( + '= [', + '- ', + '+ X', + '= ] Walnuts', + ), + pht('Diff adding a tickmark to a checkbox list.')); + + $this->assertProseParts( + '[[ ./week49 ]]', + '[[ ./week50 ]]', + array( + '= [[ ./week', + '- 49', + '+ 50', + '= ]]', + ), + pht('Diff changing a remarkup wiki link target.')); + + // Create a large corpus with many sentences and paragraphs. + $large_paragraph = 'xyz. '; + $large_paragraph = str_repeat($large_paragraph, 50); + $large_paragraph = rtrim($large_paragraph); + + $large_corpus = $large_paragraph."\n\n"; + $large_corpus = str_repeat($large_corpus, 50); + $large_corpus = rtrim($large_corpus); + + $this->assertProseParts( + $large_corpus, + "aaa\n\n".$large_corpus."\n\nzzz", + array( + "+ aaa\n\n", + '= '.$large_corpus, + "+ \n\nzzz", + ), + pht('Adding initial and final lines to a large corpus.')); + } private function assertProseParts($old, $new, array $expect_parts, $label) { $engine = new PhutilProseDifferenceEngine(); $diff = $engine->getDiff($old, $new); $parts = $diff->getParts(); $this->assertParts($expect_parts, $parts, $label); } private function assertSummaryProseParts( $old, $new, array $expect_parts, $label) { $engine = new PhutilProseDifferenceEngine(); $diff = $engine->getDiff($old, $new); $parts = $diff->getSummaryParts(); $this->assertParts($expect_parts, $parts, $label); } private function assertParts( array $expect, array $actual_parts, $label) { $actual = array(); foreach ($actual_parts as $actual_part) { $type = $actual_part['type']; $text = $actual_part['text']; switch ($type) { case '.': $actual[] = $type; break; default: $actual[] = "{$type} {$text}"; break; } } $this->assertEqual($expect, $actual, $label); } }