diff --git a/externals/cldr/cldr_windows_timezones.xml b/externals/cldr/cldr_windows_timezones.xml
new file mode 100644
index 0000000..47b689d
--- /dev/null
+++ b/externals/cldr/cldr_windows_timezones.xml
@@ -0,0 +1,769 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/resources/timezones/windows_timezones.json b/resources/timezones/windows_timezones.json
new file mode 100644
index 0000000..7a287b6
--- /dev/null
+++ b/resources/timezones/windows_timezones.json
@@ -0,0 +1,126 @@
+{
+ "Egypt Standard Time": "Africa/Cairo",
+ "Morocco Standard Time": "Africa/Casablanca",
+ "South Africa Standard Time": "Africa/Johannesburg",
+ "W. Central Africa Standard Time": "Africa/Lagos",
+ "E. Africa Standard Time": "Africa/Nairobi",
+ "Libya Standard Time": "Africa/Tripoli",
+ "Namibia Standard Time": "Africa/Windhoek",
+ "Aleutian Standard Time": "America/Adak",
+ "Alaskan Standard Time": "America/Anchorage",
+ "Tocantins Standard Time": "America/Araguaina",
+ "Paraguay Standard Time": "America/Asuncion",
+ "Bahia Standard Time": "America/Bahia",
+ "SA Pacific Standard Time": "America/Bogota",
+ "Argentina Standard Time": "America/Buenos_Aires",
+ "Eastern Standard Time (Mexico)": "America/Cancun",
+ "Venezuela Standard Time": "America/Caracas",
+ "SA Eastern Standard Time": "America/Cayenne",
+ "Central Standard Time": "America/Chicago",
+ "Mountain Standard Time (Mexico)": "America/Chihuahua",
+ "Central Brazilian Standard Time": "America/Cuiaba",
+ "Mountain Standard Time": "America/Denver",
+ "Greenland Standard Time": "America/Godthab",
+ "Turks And Caicos Standard Time": "America/Grand_Turk",
+ "Central America Standard Time": "America/Guatemala",
+ "Atlantic Standard Time": "America/Halifax",
+ "Cuba Standard Time": "America/Havana",
+ "US Eastern Standard Time": "America/Indianapolis",
+ "SA Western Standard Time": "America/La_Paz",
+ "Pacific Standard Time": "America/Los_Angeles",
+ "Central Standard Time (Mexico)": "America/Mexico_City",
+ "Saint Pierre Standard Time": "America/Miquelon",
+ "Montevideo Standard Time": "America/Montevideo",
+ "Eastern Standard Time": "America/New_York",
+ "US Mountain Standard Time": "America/Phoenix",
+ "Haiti Standard Time": "America/Port-au-Prince",
+ "Canada Central Standard Time": "America/Regina",
+ "Pacific SA Standard Time": "America/Santiago",
+ "E. South America Standard Time": "America/Sao_Paulo",
+ "Newfoundland Standard Time": "America/St_Johns",
+ "Pacific Standard Time (Mexico)": "America/Tijuana",
+ "Central Asia Standard Time": "Asia/Almaty",
+ "Jordan Standard Time": "Asia/Amman",
+ "Arabic Standard Time": "Asia/Baghdad",
+ "Azerbaijan Standard Time": "Asia/Baku",
+ "SE Asia Standard Time": "Asia/Bangkok",
+ "Altai Standard Time": "Asia/Barnaul",
+ "Middle East Standard Time": "Asia/Beirut",
+ "India Standard Time": "Asia/Calcutta",
+ "Transbaikal Standard Time": "Asia/Chita",
+ "Sri Lanka Standard Time": "Asia/Colombo",
+ "Syria Standard Time": "Asia/Damascus",
+ "Bangladesh Standard Time": "Asia/Dhaka",
+ "Arabian Standard Time": "Asia/Dubai",
+ "West Bank Standard Time": "Asia/Hebron",
+ "W. Mongolia Standard Time": "Asia/Hovd",
+ "North Asia East Standard Time": "Asia/Irkutsk",
+ "Israel Standard Time": "Asia/Jerusalem",
+ "Afghanistan Standard Time": "Asia/Kabul",
+ "Russia Time Zone 11": "Asia/Kamchatka",
+ "Pakistan Standard Time": "Asia/Karachi",
+ "Nepal Standard Time": "Asia/Katmandu",
+ "North Asia Standard Time": "Asia/Krasnoyarsk",
+ "Magadan Standard Time": "Asia/Magadan",
+ "N. Central Asia Standard Time": "Asia/Novosibirsk",
+ "Omsk Standard Time": "Asia/Omsk",
+ "North Korea Standard Time": "Asia/Pyongyang",
+ "Myanmar Standard Time": "Asia/Rangoon",
+ "Arab Standard Time": "Asia/Riyadh",
+ "Sakhalin Standard Time": "Asia/Sakhalin",
+ "Korea Standard Time": "Asia/Seoul",
+ "China Standard Time": "Asia/Shanghai",
+ "Singapore Standard Time": "Asia/Singapore",
+ "Russia Time Zone 10": "Asia/Srednekolymsk",
+ "Taipei Standard Time": "Asia/Taipei",
+ "West Asia Standard Time": "Asia/Tashkent",
+ "Georgian Standard Time": "Asia/Tbilisi",
+ "Iran Standard Time": "Asia/Tehran",
+ "Tokyo Standard Time": "Asia/Tokyo",
+ "Tomsk Standard Time": "Asia/Tomsk",
+ "Ulaanbaatar Standard Time": "Asia/Ulaanbaatar",
+ "Vladivostok Standard Time": "Asia/Vladivostok",
+ "Yakutsk Standard Time": "Asia/Yakutsk",
+ "Ekaterinburg Standard Time": "Asia/Yekaterinburg",
+ "Caucasus Standard Time": "Asia/Yerevan",
+ "Azores Standard Time": "Atlantic/Azores",
+ "Cape Verde Standard Time": "Atlantic/Cape_Verde",
+ "Greenwich Standard Time": "Atlantic/Reykjavik",
+ "Cen. Australia Standard Time": "Australia/Adelaide",
+ "E. Australia Standard Time": "Australia/Brisbane",
+ "AUS Central Standard Time": "Australia/Darwin",
+ "Aus Central W. Standard Time": "Australia/Eucla",
+ "Tasmania Standard Time": "Australia/Hobart",
+ "Lord Howe Standard Time": "Australia/Lord_Howe",
+ "W. Australia Standard Time": "Australia/Perth",
+ "AUS Eastern Standard Time": "Australia/Sydney",
+ "Dateline Standard Time": "Etc/GMT+12",
+ "Astrakhan Standard Time": "Europe/Astrakhan",
+ "W. Europe Standard Time": "Europe/Berlin",
+ "GTB Standard Time": "Europe/Bucharest",
+ "Central Europe Standard Time": "Europe/Budapest",
+ "E. Europe Standard Time": "Europe/Chisinau",
+ "Turkey Standard Time": "Europe/Istanbul",
+ "Kaliningrad Standard Time": "Europe/Kaliningrad",
+ "FLE Standard Time": "Europe/Kiev",
+ "GMT Standard Time": "Europe/London",
+ "Belarus Standard Time": "Europe/Minsk",
+ "Russian Standard Time": "Europe/Moscow",
+ "Romance Standard Time": "Europe/Paris",
+ "Russia Time Zone 3": "Europe/Samara",
+ "Central European Standard Time": "Europe/Warsaw",
+ "Mauritius Standard Time": "Indian/Mauritius",
+ "Samoa Standard Time": "Pacific/Apia",
+ "New Zealand Standard Time": "Pacific/Auckland",
+ "Bougainville Standard Time": "Pacific/Bougainville",
+ "Chatham Islands Standard Time": "Pacific/Chatham",
+ "Easter Island Standard Time": "Pacific/Easter",
+ "Fiji Standard Time": "Pacific/Fiji",
+ "Central Pacific Standard Time": "Pacific/Guadalcanal",
+ "Hawaiian Standard Time": "Pacific/Honolulu",
+ "Line Islands Standard Time": "Pacific/Kiritimati",
+ "Marquesas Standard Time": "Pacific/Marquesas",
+ "Norfolk Standard Time": "Pacific/Norfolk",
+ "West Pacific Standard Time": "Pacific/Port_Moresby",
+ "Tonga Standard Time": "Pacific/Tongatapu"
+}
diff --git a/scripts/timezones/generate_windows_timezone_map.php b/scripts/timezones/generate_windows_timezone_map.php
new file mode 100755
index 0000000..41b06be
--- /dev/null
+++ b/scripts/timezones/generate_windows_timezone_map.php
@@ -0,0 +1,46 @@
+#!/usr/bin/env php
+windowsZones->mapTimezones->mapZone;
+foreach ($zones as $zone) {
+ $windows_name = (string)$zone['other'];
+ $target_name = (string)$zone['type'];
+
+ // Ignore the offset-based timezones from the CLDR map, since we handle
+ // these later.
+ if (isset($ignore[$windows_name])) {
+ continue;
+ }
+
+ // We've already seen this timezone so we don't need to add it to the map
+ // again.
+ if (isset($result_map[$windows_name])) {
+ continue;
+ }
+
+ $result_map[$windows_name] = $target_name;
+}
+
+asort($result_map);
+
+echo id(new PhutilJSON())
+ ->encodeFormatted($result_map);
diff --git a/src/parser/calendar/data/PhutilCalendarAbsoluteDateTime.php b/src/parser/calendar/data/PhutilCalendarAbsoluteDateTime.php
index 4fbb2f6..0129e6c 100644
--- a/src/parser/calendar/data/PhutilCalendarAbsoluteDateTime.php
+++ b/src/parser/calendar/data/PhutilCalendarAbsoluteDateTime.php
@@ -1,275 +1,287 @@
\d{4})(?P\d{2})(?P\d{2})'.
'(?:'.
'T(?P\d{2})(?P\d{2})(?P\d{2})(?Z)?'.
')?'.
'\z/';
$matches = null;
$ok = preg_match($pattern, $value, $matches);
if (!$ok) {
throw new Exception(
pht(
'Expected ISO8601 datetime in the format "19990105T112233Z", '.
'found "%s".',
$value));
}
if (isset($matches['z'])) {
if ($timezone != 'UTC') {
throw new Exception(
pht(
'ISO8601 date ends in "Z" indicating UTC, but a timezone other '.
'than UTC ("%s") was specified.',
$timezone));
}
}
$datetime = id(new self())
->setYear((int)$matches['y'])
->setMonth((int)$matches['m'])
->setDay((int)$matches['d'])
->setTimezone($timezone);
if (isset($matches['h'])) {
$datetime
->setHour((int)$matches['h'])
->setMinute((int)$matches['i'])
->setSecond((int)$matches['s']);
} else {
$datetime
->setIsAllDay(true);
}
return $datetime;
}
public static function newFromEpoch($epoch, $timezone = 'UTC') {
$date = new DateTime('@'.$epoch);
$zone = new DateTimeZone($timezone);
$date->setTimezone($zone);
return id(new self())
->setYear((int)$date->format('Y'))
->setMonth((int)$date->format('m'))
->setDay((int)$date->format('d'))
->setHour((int)$date->format('H'))
->setMinute((int)$date->format('i'))
->setSecond((int)$date->format('s'))
->setTimezone($timezone);
}
public static function newFromDictionary(array $dict) {
static $keys;
if ($keys === null) {
$keys = array_fuse(
array(
'kind',
'year',
'month',
'day',
'hour',
'minute',
'second',
'timezone',
'isAllDay',
));
}
foreach ($dict as $key => $value) {
if (!isset($keys[$key])) {
throw new Exception(
pht(
'Unexpected key "%s" in datetime dictionary, expected keys: %s.',
$key,
implode(', ', array_keys($keys))));
}
}
if (idx($dict, 'kind') !== 'absolute') {
throw new Exception(
pht(
'Expected key "%s" with value "%s" in datetime dictionary.',
'kind',
'absolute'));
}
if (!isset($dict['year'])) {
throw new Exception(
pht(
'Expected key "%s" in datetime dictionary.',
'year'));
}
$datetime = id(new self())
->setYear(idx($dict, 'year'))
->setMonth(idx($dict, 'month', 1))
->setDay(idx($dict, 'day', 1))
->setHour(idx($dict, 'hour', 0))
->setMinute(idx($dict, 'minute', 0))
->setSecond(idx($dict, 'second', 0))
->setTimezone(idx($dict, 'timezone'))
->setIsAllDay((bool)idx($dict, 'isAllDay', false));
return $datetime;
}
public function newRelativeDateTime($duration) {
if (is_string($duration)) {
$duration = PhutilCalendarDuration::newFromISO8601($duration);
}
if (!($duration instanceof PhutilCalendarDuration)) {
throw new Exception(
pht(
'Expected "PhutilCalendarDuration" object or ISO8601 duration '.
'string.'));
}
return id(new PhutilCalendarRelativeDateTime())
->setOrigin($this)
->setDuration($duration);
}
public function toDictionary() {
return array(
'kind' => 'absolute',
'year' => (int)$this->getYear(),
'month' => (int)$this->getMonth(),
'day' => (int)$this->getDay(),
'hour' => (int)$this->getHour(),
'minute' => (int)$this->getMinute(),
'second' => (int)$this->getSecond(),
'timezone' => $this->getTimezone(),
'isAllDay' => (bool)$this->getIsAllDay(),
);
}
public function setYear($year) {
$this->year = $year;
return $this;
}
public function getYear() {
return $this->year;
}
public function setMonth($month) {
$this->month = $month;
return $this;
}
public function getMonth() {
return $this->month;
}
public function setDay($day) {
$this->day = $day;
return $this;
}
public function getDay() {
return $this->day;
}
public function setHour($hour) {
$this->hour = $hour;
return $this;
}
public function getHour() {
return $this->hour;
}
public function setMinute($minute) {
$this->minute = $minute;
return $this;
}
public function getMinute() {
return $this->minute;
}
public function setSecond($second) {
$this->second = $second;
return $this;
}
public function getSecond() {
return $this->second;
}
public function setTimezone($timezone) {
$this->timezone = $timezone;
return $this;
}
public function getTimezone() {
return $this->timezone;
}
private function getEffectiveTimezone() {
- $zone = $this->getTimezone();
- if ($zone !== null) {
- return $zone;
+ $date_timezone = $this->getTimezone();
+ $viewer_timezone = $this->getViewerTimezone();
+
+ // Because all-day events are always "floating", the effective timezone
+ // is the viewer timezone if it is available. Otherwise, we'll return a
+ // DateTime object with the correct values, but it will be incorrectly
+ // adjusted forward or backward to the viewer's zone later.
+
+ $zones = array();
+ if ($this->getIsAllDay()) {
+ $zones[] = $viewer_timezone;
+ $zones[] = $date_timezone;
+ } else {
+ $zones[] = $date_timezone;
+ $zones[] = $viewer_timezone;
}
+ $zones = array_filter($zones);
- $zone = $this->getViewerTimezone();
- if ($zone !== null) {
- return $zone;
+ if (!$zones) {
+ throw new Exception(
+ pht(
+ 'Datetime has no timezone or viewer timezone.'));
}
- throw new Exception(
- pht(
- 'Datetime has no timezone or viewer timezone.'));
+ return head($zones);
}
public function newPHPDateTimeZone() {
$zone = $this->getEffectiveTimezone();
return new DateTimeZone($zone);
}
public function newPHPDateTime() {
$zone = $this->newPHPDateTimeZone();
$y = $this->getYear();
$m = $this->getMonth();
$d = $this->getDay();
if ($this->getIsAllDay()) {
$h = 0;
$i = 0;
$s = 0;
} else {
$h = $this->getHour();
$i = $this->getMinute();
$s = $this->getSecond();
}
$format = sprintf('%04d-%02d-%02d %02d:%02d:%02d', $y, $m, $d, $h, $i, $s);
return new DateTime($format, $zone);
}
public function newAbsoluteDateTime() {
return clone $this;
}
}
diff --git a/src/parser/calendar/ics/PhutilICSParser.php b/src/parser/calendar/ics/PhutilICSParser.php
index e473cad..015bea6 100644
--- a/src/parser/calendar/ics/PhutilICSParser.php
+++ b/src/parser/calendar/ics/PhutilICSParser.php
@@ -1,907 +1,919 @@
stack = array();
$this->node = null;
$this->cursor = null;
$this->warnings = array();
$lines = $this->unfoldICSLines($data);
$this->lines = $lines;
$root = $this->newICSNode('');
$this->stack[] = $root;
$this->node = $root;
foreach ($lines as $key => $line) {
$this->cursor = $key;
$matches = null;
if (preg_match('(^BEGIN:(.*)\z)', $line, $matches)) {
$this->beginParsingNode($matches[1]);
} else if (preg_match('(^END:(.*)\z)', $line, $matches)) {
$this->endParsingNode($matches[1]);
} else {
if (count($this->stack) < 2) {
$this->raiseParseFailure(
self::PARSE_ROOT_PROPERTY,
pht(
'Found unexpected property at ICS document root.'));
}
$this->parseICSProperty($line);
}
}
if (count($this->stack) > 1) {
$this->raiseParseFailure(
self::PARSE_MISSING_END,
pht(
'Expected all "BEGIN:" sections in ICS document to have '.
'corresponding "END:" sections.'));
}
$this->node = null;
$this->lines = null;
$this->cursor = null;
return $root;
}
private function getNode() {
return $this->node;
}
private function unfoldICSLines($data) {
$lines = phutil_split_lines($data, $retain_endings = false);
$this->lines = $lines;
// ICS files are wrapped at 75 characters, with overlong lines continued
// on the following line with an initial space or tab. Unwrap all of the
// lines in the file.
// This unwrapping is specifically byte-oriented, not character oriented,
// and RFC5545 anticipates that simple implementations may even split UTF8
// characters in the middle.
$last = null;
foreach ($lines as $idx => $line) {
$this->cursor = $idx;
if (!preg_match('/^[ \t]/', $line)) {
$last = $idx;
continue;
}
if ($last === null) {
$this->raiseParseFailure(
self::PARSE_INITIAL_UNFOLD,
pht(
'First line of ICS file begins with a space or tab, but this '.
'marks a line which should be unfolded.'));
}
$lines[$last] = $lines[$last].substr($line, 1);
unset($lines[$idx]);
}
return $lines;
}
private function beginParsingNode($type) {
$node = $this->getNode();
$new_node = $this->newICSNode($type);
if ($node instanceof PhutilCalendarContainerNode) {
$node->appendChild($new_node);
} else {
$this->raiseParseFailure(
self::PARSE_UNEXPECTED_CHILD,
pht(
'Found unexpected node "%s" inside node "%s".',
$new_node->getAttribute('ics.type'),
$node->getAttribute('ics.type')));
}
$this->stack[] = $new_node;
$this->node = $new_node;
return $this;
}
private function newICSNode($type) {
switch ($type) {
case '':
$node = new PhutilCalendarRootNode();
break;
case 'VCALENDAR':
$node = new PhutilCalendarDocumentNode();
break;
case 'VEVENT':
$node = new PhutilCalendarEventNode();
break;
default:
$node = new PhutilCalendarRawNode();
break;
}
$node->setAttribute('ics.type', $type);
return $node;
}
private function endParsingNode($type) {
$node = $this->getNode();
if ($node instanceof PhutilCalendarRootNode) {
$this->raiseParseFailure(
self::PARSE_EXTRA_END,
pht(
'Found unexpected "END" without a "BEGIN".'));
}
$old_type = $node->getAttribute('ics.type');
if ($old_type != $type) {
$this->raiseParseFailure(
self::PARSE_MISMATCHED_SECTIONS,
pht(
'Found mismatched "BEGIN" ("%s") and "END" ("%s") sections.',
$old_type,
$type));
}
array_pop($this->stack);
$this->node = last($this->stack);
return $this;
}
private function parseICSProperty($line) {
$matches = null;
// Properties begin with an alphanumeric name with no escaping, followed
// by either a ";" (to begin a list of parameters) or a ":" (to begin
// the actual field body).
$ok = preg_match('(^([A-Za-z0-9-]+)([;:])(.*)\z)', $line, $matches);
if (!$ok) {
$this->raiseParseFailure(
self::PARSE_MALFORMED_PROPERTY,
pht(
'Found malformed property in ICS document.'));
}
$name = $matches[1];
$body = $matches[3];
$has_parameters = ($matches[2] == ';');
$parameters = array();
if ($has_parameters) {
// Parameters are a sensible name, a literal "=", a pile of magic,
// and then maybe a comma and another parameter.
while (true) {
// We're going to get the first couple of parts first.
$ok = preg_match('(^([^=]+)=)', $body, $matches);
if (!$ok) {
$this->raiseParseFailure(
self::PARSE_MALFORMED_PARAMETER_NAME,
pht(
'Found malformed property in ICS document: %s',
$body));
}
$param_name = $matches[1];
$body = substr($body, strlen($matches[0]));
// Now we're going to match zero or more values.
$param_values = array();
while (true) {
// The value can either be a double-quoted string or an unquoted
// string, with some characters forbidden.
if (strlen($body) && $body[0] == '"') {
$is_quoted = true;
$ok = preg_match(
'(^"([^\x00-\x08\x10-\x19"]*)")',
$body,
$matches);
if (!$ok) {
$this->raiseParseFailure(
self::PARSE_MALFORMED_DOUBLE_QUOTE,
pht(
'Found malformed double-quoted string in ICS document '.
'parameter value.'));
}
} else {
$is_quoted = false;
// It's impossible for this not to match since it can match
// nothing, and it's valid for it to match nothing.
preg_match('(^([^\x00-\x08\x10-\x19";:,]*))', $body, $matches);
}
// NOTE: RFC5545 says "Property parameter values that are not in
// quoted-strings are case-insensitive." -- that is, the quoted and
// unquoted representations are not equivalent. Thus, preserve the
// original formatting in case we ever need to respect this.
$param_values[] = array(
'value' => $this->unescapeParameterValue($matches[1]),
'quoted' => $is_quoted,
);
$body = substr($body, strlen($matches[0]));
if (!strlen($body)) {
$this->raiseParseFailure(
self::PARSE_MISSING_VALUE,
pht(
'Expected ":" after parameters in ICS document property.'));
}
// If we have a comma now, we're going to read another value. Strip
// it off and keep going.
if ($body[0] == ',') {
$body = substr($body, 1);
continue;
}
// If we have a semicolon, we're going to read another parameter.
if ($body[0] == ';') {
break;
}
// If we have a colon, this is the last value and also the last
// property. Break, then handle the colon below.
if ($body[0] == ':') {
break;
}
$short_body = id(new PhutilUTF8StringTruncator())
->setMaximumGlyphs(32)
->truncateString($body);
// We aren't expecting anything else.
$this->raiseParseFailure(
self::PARSE_UNEXPECTED_TEXT,
pht(
'Found unexpected text ("%s") after reading parameter value.',
$short_body));
}
$parameters[] = array(
'name' => $param_name,
'values' => $param_values,
);
if ($body[0] == ';') {
$body = substr($body, 1);
continue;
}
if ($body[0] == ':') {
$body = substr($body, 1);
break;
}
}
}
$value = $this->unescapeFieldValue($name, $parameters, $body);
$node = $this->getNode();
$raw = $node->getAttribute('ics.properties', array());
$raw[] = array(
'name' => $name,
'parameters' => $parameters,
'value' => $value,
);
$node->setAttribute('ics.properties', $raw);
switch ($node->getAttribute('ics.type')) {
case 'VEVENT':
$this->didParseEventProperty($node, $name, $parameters, $value);
break;
}
}
private function unescapeParameterValue($data) {
// The parameter grammar is adjusted by RFC6868 to permit escaping with
// carets. Remove that escaping.
// This escaping is a bit weird because it's trying to be backwards
// compatible and the original spec didn't think about this and didn't
// provide much room to fix things.
$out = '';
$esc = false;
foreach (phutil_utf8v($data) as $c) {
if (!$esc) {
if ($c != '^') {
$out .= $c;
} else {
$esc = true;
}
} else {
switch ($c) {
case 'n':
$out .= "\n";
break;
case '^':
$out .= '^';
break;
case "'":
// NOTE: This is " " being decoded into a
// double quote!
$out .= '"';
break;
default:
// NOTE: The caret is NOT an escape for any other characters.
// This is a "MUST" requirement of RFC6868.
$out .= '^'.$c;
break;
}
}
}
// NOTE: Because caret on its own just means "caret" for backward
// compatibility, we don't warn if we're still in escaped mode once we
// reach the end of the string.
return $out;
}
private function unescapeFieldValue($name, array $parameters, $data) {
// NOTE: The encoding of the field value data is dependent on the field
// name (which defines a default encoding) and the parameters (which may
// include "VALUE", specifying a type of the data.
$default_types = array(
'CALSCALE' => 'TEXT',
'METHOD' => 'TEXT',
'PRODID' => 'TEXT',
'VERSION' => 'TEXT',
'ATTACH' => 'URI',
'CATEGORIES' => 'TEXT',
'CLASS' => 'TEXT',
'COMMENT' => 'TEXT',
'DESCRIPTION' => 'TEXT',
// TODO: The spec appears to contradict itself: it says that the value
// type is FLOAT, but it also says that this property value is actually
// two semicolon-separated values, which is not what FLOAT is defined as.
'GEO' => 'TEXT',
'LOCATION' => 'TEXT',
'PERCENT-COMPLETE' => 'INTEGER',
'PRIORITY' => 'INTEGER',
'RESOURCES' => 'TEXT',
'STATUS' => 'TEXT',
'SUMMARY' => 'TEXT',
'COMPLETED' => 'DATE-TIME',
'DTEND' => 'DATE-TIME',
'DUE' => 'DATE-TIME',
'DTSTART' => 'DATE-TIME',
'DURATION' => 'DURATION',
'FREEBUSY' => 'PERIOD',
'TRANSP' => 'TEXT',
'TZID' => 'TEXT',
'TZNAME' => 'TEXT',
'TZOFFSETFROM' => 'UTC-OFFSET',
'TZOFFSETTO' => 'UTC-OFFSET',
'TZURL' => 'URI',
'ATTENDEE' => 'CAL-ADDRESS',
'CONTACT' => 'TEXT',
'ORGANIZER' => 'CAL-ADDRESS',
'RECURRENCE-ID' => 'DATE-TIME',
'RELATED-TO' => 'TEXT',
'URL' => 'URI',
'UID' => 'TEXT',
'EXDATE' => 'DATE-TIME',
'RDATE' => 'DATE-TIME',
'RRULE' => 'RECUR',
'ACTION' => 'TEXT',
'REPEAT' => 'INTEGER',
'TRIGGER' => 'DURATION',
'CREATED' => 'DATE-TIME',
'DTSTAMP' => 'DATE-TIME',
'LAST-MODIFIED' => 'DATE-TIME',
'SEQUENCE' => 'INTEGER',
'REQUEST-STATUS' => 'TEXT',
);
$value_type = idx($default_types, $name, 'TEXT');
foreach ($parameters as $parameter) {
if ($parameter['name'] == 'VALUE') {
$value_type = idx(head($parameter['values']), 'value');
}
}
switch ($value_type) {
case 'BINARY':
$result = base64_decode($data, true);
if ($result === false) {
$this->raiseParseFailure(
self::PARSE_BAD_BASE64,
pht(
'Unable to decode base64 data: %s',
$data));
}
break;
case 'BOOLEAN':
$map = array(
'true' => true,
'false' => false,
);
$result = phutil_utf8_strtolower($data);
if (!isset($map[$result])) {
$this->raiseParseFailure(
self::PARSE_BAD_BOOLEAN,
pht(
'Unexpected BOOLEAN value "%s".',
$data));
}
$result = $map[$result];
break;
case 'CAL-ADDRESS':
$result = $data;
break;
case 'DATE':
// This is a comma-separated list of "YYYYMMDD" values.
$result = explode(',', $data);
break;
case 'DATE-TIME':
if (!strlen($data)) {
$result = array();
} else {
$result = explode(',', $data);
}
break;
case 'DURATION':
if (!strlen($data)) {
$result = array();
} else {
$result = explode(',', $data);
}
break;
case 'FLOAT':
$result = explode(',', $data);
foreach ($result as $k => $v) {
$result[$k] = (float)$v;
}
break;
case 'INTEGER':
$result = explode(',', $data);
foreach ($result as $k => $v) {
$result[$k] = (int)$v;
}
break;
case 'PERIOD':
$result = explode(',', $data);
break;
case 'RECUR':
$result = $data;
break;
case 'TEXT':
$result = $this->unescapeTextValue($data);
break;
case 'TIME':
$result = explode(',', $data);
break;
case 'URI':
$result = $data;
break;
case 'UTC-OFFSET':
$result = $data;
break;
default:
// RFC5545 says we MUST preserve the data for any types we don't
// recognize.
$result = $data;
break;
}
return array(
'type' => $value_type,
'value' => $result,
'raw' => $data,
);
}
private function unescapeTextValue($data) {
$result = array();
$buf = '';
$esc = false;
foreach (phutil_utf8v($data) as $c) {
if (!$esc) {
if ($c == '\\') {
$esc = true;
} else if ($c == ',') {
$result[] = $buf;
$buf = '';
} else {
$buf .= $c;
}
} else {
switch ($c) {
case 'n':
case 'N':
$buf .= "\n";
break;
default:
$buf .= $c;
break;
}
$esc = false;
}
}
if ($esc) {
$this->raiseParseFailure(
self::PARSE_UNESCAPED_BACKSLASH,
pht(
'ICS document contains TEXT value ending with unescaped '.
'backslash.'));
}
$result[] = $buf;
return $result;
}
private function raiseParseFailure($code, $message) {
if ($this->lines && isset($this->lines[$this->cursor])) {
$message = pht(
"ICS Parse Error near line %s:\n\n>>> %s\n\n%s",
$this->cursor + 1,
$this->lines[$this->cursor],
$message);
} else {
$message = pht(
'ICS Parse Error: %s',
$message);
}
throw id(new PhutilICSParserException($message))
->setParserFailureCode($code);
}
private function raiseWarning($code, $message) {
$this->warnings[] = array(
'code' => $code,
'line' => $this->cursor,
'text' => $this->lines[$this->cursor],
'message' => $message,
);
return $this;
}
public function getWarnings() {
return $this->warnings;
}
private function didParseEventProperty(
PhutilCalendarEventNode $node,
$name,
array $parameters,
array $value) {
switch ($name) {
case 'UID':
$text = $this->newTextFromProperty($parameters, $value);
$node->setUID($text);
break;
case 'CREATED':
$datetime = $this->newDateTimeFromProperty($parameters, $value);
$node->setCreatedDateTime($datetime);
break;
case 'DTSTAMP':
$datetime = $this->newDateTimeFromProperty($parameters, $value);
$node->setModifiedDateTime($datetime);
break;
case 'SUMMARY':
$text = $this->newTextFromProperty($parameters, $value);
$node->setName($text);
break;
case 'DESCRIPTION':
$text = $this->newTextFromProperty($parameters, $value);
$node->setDescription($text);
break;
case 'DTSTART':
$datetime = $this->newDateTimeFromProperty($parameters, $value);
$node->setStartDateTime($datetime);
break;
case 'DTEND':
$datetime = $this->newDateTimeFromProperty($parameters, $value);
$node->setEndDateTime($datetime);
break;
case 'DURATION':
$duration = $this->newDurationFromProperty($parameters, $value);
$node->setDuration($duration);
break;
case 'RRULE':
$rrule = $this->newRecurrenceRuleFromProperty($parameters, $value);
$node->setRecurrenceRule($rrule);
break;
case 'RECURRENCE-ID':
$text = $this->newTextFromProperty($parameters, $value);
$node->setRecurrenceID($text);
break;
case 'ATTENDEE':
$attendee = $this->newAttendeeFromProperty($parameters, $value);
$node->addAttendee($attendee);
break;
}
}
private function newTextFromProperty(array $parameters, array $value) {
$value = $value['value'];
return implode("\n\n", $value);
}
private function newAttendeeFromProperty(array $parameters, array $value) {
$uri = $value['value'];
switch (idx($parameters, 'PARTSTAT')) {
case 'ACCEPTED':
$status = PhutilCalendarUserNode::STATUS_ACCEPTED;
break;
case 'DECLINED':
$status = PhutilCalendarUserNode::STATUS_DECLINED;
break;
case 'NEEDS-ACTION':
default:
$status = PhutilCalendarUserNode::STATUS_INVITED;
break;
}
$name = $this->getScalarParameterValue($parameters, 'CN');
return id(new PhutilCalendarUserNode())
->setURI($uri)
->setName($name)
->setStatus($status);
}
private function newDateTimeFromProperty(array $parameters, array $value) {
$value = $value['value'];
if (!$value) {
$this->raiseParseFailure(
self::PARSE_EMPTY_DATETIME,
pht(
'Expected DATE-TIME to have exactly one value, found none.'));
}
if (count($value) > 1) {
$this->raiseParseFailure(
self::PARSE_MANY_DATETIME,
pht(
'Expected DATE-TIME to have exactly one value, found more than '.
'one.'));
}
$value = head($value);
$tzid = $this->getScalarParameterValue($parameters, 'TZID');
if (preg_match('/Z\z/', $value)) {
if ($tzid) {
$this->raiseWarning(
self::WARN_TZID_UTC,
pht(
'DATE-TIME "%s" uses "Z" to specify UTC, but also has a TZID '.
'parameter with value "%s". This violates RFC5545. The TZID '.
'will be ignored, and the value will be interpreted as UTC.',
$value,
$tzid));
}
$tzid = 'UTC';
} else if ($tzid !== null) {
$tzid = $this->guessTimezone($tzid);
}
try {
$datetime = PhutilCalendarAbsoluteDateTime::newFromISO8601(
$value,
$tzid);
} catch (Exception $ex) {
$this->raiseParseFailure(
self::PARSE_BAD_DATETIME,
pht(
'Error parsing DATE-TIME: %s',
$ex->getMessage()));
}
return $datetime;
}
private function newDurationFromProperty(array $parameters, array $value) {
$value = $value['value'];
if (!$value) {
$this->raiseParseFailure(
self::PARSE_EMPTY_DURATION,
pht(
'Expected DURATION to have exactly one value, found none.'));
}
if (count($value) > 1) {
$this->raiseParseFailure(
self::PARSE_MANY_DURATION,
pht(
'Expected DURATION to have exactly one value, found more than '.
'one.'));
}
$value = head($value);
try {
$duration = PhutilCalendarDuration::newFromISO8601($value);
} catch (Exception $ex) {
$this->raiseParseFailure(
self::PARSE_BAD_DURATION,
pht(
'Invalid DURATION: %s',
$ex->getMessage()));
}
return $duration;
}
private function newRecurrenceRuleFromProperty(array $parameters, $value) {
return PhutilCalendarRecurrenceRule::newFromRRULE($value['value']);
}
private function getScalarParameterValue(
array $parameters,
$name,
$default = null) {
$match = null;
foreach ($parameters as $parameter) {
if ($parameter['name'] == $name) {
$match = $parameter;
}
}
if ($match === null) {
return $default;
}
$value = $match['values'];
if (!$value) {
// Parameter is specified, but with no value, like "KEY=". Just return
// the default, as though the parameter was not specified.
return $default;
}
if (count($value) > 1) {
$this->raiseParseFailure(
self::PARSE_MULTIPLE_PARAMETERS,
pht(
'Expected parameter "%s" to have at most one value, but found '.
'more than one.',
$name));
}
return idx(head($value), 'value');
}
private function guessTimezone($tzid) {
$map = DateTimeZone::listIdentifiers();
$map = array_fuse($map);
if (isset($map[$tzid])) {
// This is a real timezone we recognize, so just use it as provided.
return $tzid;
}
// These are alternate names for timezones.
- $aliases = array(
- 'Etc/GMT' => 'UTC',
- );
+ static $aliases;
+
+ if ($aliases === null) {
+ $aliases = array(
+ 'Etc/GMT' => 'UTC',
+ );
+
+ // Load the map of Windows timezones.
+ $root_path = dirname(phutil_get_library_root('phutil'));
+ $windows_path = $root_path.'/resources/timezones/windows_timezones.json';
+ $windows_data = Filesystem::readFile($windows_path);
+ $windows_zones = phutil_json_decode($windows_data);
+
+ $aliases = $aliases + $windows_zones;
+ }
if (isset($aliases[$tzid])) {
return $aliases[$tzid];
}
// Look for something that looks like "UTC+3" or "GMT -05.00". If we find
- // anything
+ // anything, pick a timezone with that offset.
$offset_pattern =
'/'.
'(?:UTC|GMT)'.
'\s*'.
'(?P[+-])'.
'\s*'.
'(?P\d+)'.
'(?:'.
'[:.](?P\d+)'.
')?'.
'/i';
$matches = null;
if (preg_match($offset_pattern, $tzid, $matches)) {
$hours = (int)$matches['h'];
$minutes = (int)idx($matches, 'm');
$offset = ($hours * 60 * 60) + ($minutes * 60);
if (idx($matches, 'sign') == '-') {
$offset = -$offset;
}
// NOTE: We could possibly do better than this, by using the event start
// time to guess a timezone. However, that won't work for recurring
// events and would require us to do this work after finishing initial
// parsing. Since these unusual offset-based timezones appear to be rare,
// the benefit may not be worth the complexity.
$now = new DateTime('@'.time());
foreach ($map as $identifier) {
$zone = new DateTimeZone($identifier);
if ($zone->getOffset($now) == $offset) {
$this->raiseWarning(
self::WARN_TZID_GUESS,
pht(
'TZID "%s" is unknown, guessing "%s" based on pattern "%s".',
$tzid,
$identifier,
$matches[0]));
return $identifier;
}
}
}
$this->raiseWarning(
self::WARN_TZID_IGNORED,
pht(
'TZID "%s" is unknown, using UTC instead.',
$tzid));
return 'UTC';
}
}
diff --git a/src/utils/PhutilEditDistanceMatrix.php b/src/utils/PhutilEditDistanceMatrix.php
index 1dd21f7..6ad5ab0 100644
--- a/src/utils/PhutilEditDistanceMatrix.php
+++ b/src/utils/PhutilEditDistanceMatrix.php
@@ -1,555 +1,562 @@
setSequences(str_split('ran'), str_split('rat'));
*
* $cost = $matrix->getEditDistance();
*
* Edit distance computation is slow and requires a large amount of memory;
* both are roughly O(N * M) in the length of the strings.
*
* You can customize the cost of insertions, deletions and replacements. By
* default, each has a cost of 1.
*
* $matrix->setReplaceCost(1);
*
* By default, transpositions are not evaluated (i.e., the algorithm is
* Levenshtein). You can cause transpositions to be evaluated by setting a
* transpose cost (which will change the algorithm to Damerau-Levenshtein):
*
* $matrix->setTransposeCost(1);
*
* You can also provide a cost to alter the type of operation being applied.
* Many strings have several equivalently expensive edit sequences, but one
* some sequences are more readable to humans than others. Providing a small
* cost to alter operation type tends to smooth out the sequence and produce
* long runs of a single operation, which are generally more readable. For
* example, these strings:
*
* (x)
* ((x))
*
* ...have edit strings "issis" and "isssi", which describe edit operations with
* the same cost, but the latter string is more comprehensible to human viewers.
*
* If you set an alter cost, you must call @{method:setComputeString} to enable
* type computation. The alter cost should generally be smaller than `c / N`,
* where `c` is the smallest operational cost and `N` is the length of the
* longest string. For example, if you are using the default costs (insert = 1,
* delete = 1, replace = 1) and computing edit distances for strings of fewer
* than 1,000 characters, you might set the alter cost to 0.001.
*/
final class PhutilEditDistanceMatrix extends Phobject {
private $insertCost = 1;
private $deleteCost = 1;
private $replaceCost = 1;
private $transposeCost = null;
private $alterCost = 0;
private $maximumLength;
private $computeString;
private $applySmoothing = self::SMOOTHING_NONE;
+ private $reachedMaximumLength;
private $x;
private $y;
private $prefix;
private $suffix;
private $distanceMatrix = null;
private $typeMatrix = null;
const SMOOTHING_NONE = 'none';
const SMOOTHING_INTERNAL = 'internal';
const SMOOTHING_FULL = 'full';
public function setMaximumLength($maximum_length) {
$this->maximumLength = $maximum_length;
return $this;
}
public function getMaximumLength() {
return coalesce($this->maximumLength, $this->getInfinity());
}
+ public function didReachMaximumLength() {
+ return $this->reachedMaximumLength;
+ }
+
public function setComputeString($compute_string) {
$this->computeString = $compute_string;
return $this;
}
public function getComputeString() {
return $this->computeString;
}
public function setTransposeCost($transpose_cost) {
$this->transposeCost = $transpose_cost;
return $this;
}
public function getTransposeCost() {
return $this->transposeCost;
}
public function setReplaceCost($replace_cost) {
$this->replaceCost = $replace_cost;
return $this;
}
public function getReplaceCost() {
return $this->replaceCost;
}
public function setDeleteCost($delete_cost) {
$this->deleteCost = $delete_cost;
return $this;
}
public function getDeleteCost() {
return $this->deleteCost;
}
public function setInsertCost($insert_cost) {
$this->insertCost = $insert_cost;
return $this;
}
public function getInsertCost() {
return $this->insertCost;
}
public function setAlterCost($alter_cost) {
$this->alterCost = $alter_cost;
return $this;
}
public function getAlterCost() {
return $this->alterCost;
}
public function setApplySmoothing($apply_smoothing) {
$this->applySmoothing = $apply_smoothing;
return $this;
}
public function getApplySmoothing() {
return $this->applySmoothing;
}
public function setSequences(array $x, array $y) {
// NOTE: We strip common prefixes and suffixes from the inputs because
// the runtime of the edit distance algorithm is large and it is common
// to diff similar strings.
$xl = count($x);
$yl = count($y);
$l = min($xl, $yl);
$prefix_l = 0;
$suffix_l = 0;
for ($ii = 0; $ii < $l; $ii++) {
if ($x[$ii] !== $y[$ii]) {
break;
}
$prefix_l++;
}
for ($ii = 1; $ii <= ($l - $prefix_l); $ii++) {
if ($x[$xl - $ii] !== $y[$yl - $ii]) {
break;
}
$suffix_l++;
}
$this->prefix = array_slice($x, 0, $prefix_l);
$this->suffix = array_slice($x, $xl - $suffix_l);
$this->x = array_slice($x, $prefix_l, $xl - ($suffix_l + $prefix_l));
$this->y = array_slice($y, $prefix_l, $yl - ($suffix_l + $prefix_l));
$this->distanceMatrix = null;
return $this;
}
private function requireSequences() {
if ($this->x === null) {
throw new PhutilInvalidStateException('setSequences');
}
}
public function getEditDistance() {
$this->requireSequences();
$x = $this->x;
$y = $this->y;
if (!$x) {
return $this->insertCost * count($y);
}
if (!$y) {
return $this->deleteCost * count($x);
}
$max = $this->getMaximumLength();
if (count($x) > $max || count($y) > $max) {
+ $this->reachedMaximumLength = true;
return ($this->insertCost * count($y)) + ($this->deleteCost * count($x));
}
if ($x === $y) {
return 0;
}
$matrix = $this->getDistanceMatrix();
return $matrix[count($x)][count($y)];
}
/**
* Return a string representing the edits between the sequences. The string
* has these characters:
*
* - s (same): Same character in both strings.
* - i (insert): Character inserted.
* - d (delete): Character deleted.
* - x (replace): Character replaced.
* - t (transpose): Character transposed.
*/
public function getEditString() {
$this->requireSequences();
$x = $this->x;
$y = $this->y;
if (!$x && !$y) {
return $this->padEditString('');
}
if (!$x) {
return $this->padEditString(str_repeat('i', count($y)));
}
if (!$y) {
return $this->padEditString(str_repeat('d', count($x)));
}
if ($x === $y) {
return $this->padEditString(str_repeat('s', count($x)));
}
$max = $this->getMaximumLength();
if (count($x) > $max || count($y) > $max) {
+ $this->reachedMaximumLength = true;
return $this->padEditString(
str_repeat('d', count($x)).
str_repeat('i', count($y)));
}
$matrix = $this->getTypeMatrix();
$xx = count($x);
$yy = count($y);
$transposes = array();
$str = '';
while (true) {
$type = $matrix[$xx][$yy];
if (is_array($type)) {
$chr = 't';
$transposes[$type[0]][$type[1]] = true;
$type = $type[2];
} else {
$chr = $type;
}
if (isset($transposes[$xx][$yy])) {
$chr = 't';
}
if ($type == 's' || $type == 'x') {
$xx -= 1;
$yy -= 1;
} else if ($type == 'i') {
$yy -= 1;
} else if ($type == 'd') {
$xx -= 1;
} else {
throw new Exception(pht("Unknown type '%s' in type matrix.", $type));
}
$str .= $chr;
if ($xx <= 0 && $yy <= 0) {
break;
}
}
$str = strrev($str);
// For full smoothing, we pad the edit string before smoothing it, so
// ranges of similar characters at the beginning or end of the string can
// also be smoothed.
// For internal smoothing, we only smooth ranges within the change itself.
$smoothing = $this->getApplySmoothing();
switch ($smoothing) {
case self::SMOOTHING_FULL:
$str = $this->padEditString($str);
$str = $this->applySmoothing($str, true);
break;
case self::SMOOTHING_INTERNAL:
$str = $this->applySmoothing($str, false);
$str = $this->padEditString($str);
break;
case self::SMOOTHING_NONE:
$str = $this->padEditString($str);
break;
default:
throw new Exception(
pht(
'Unknown smoothing type "%s".',
$smoothing));
}
return $str;
}
private function padEditString($str) {
if ($this->prefix) {
$str = str_repeat('s', count($this->prefix)).$str;
}
if ($this->suffix) {
$str = $str.str_repeat('s', count($this->suffix));
}
return $str;
}
private function getTypeMatrix() {
if (!$this->computeString) {
throw new PhutilInvalidStateException('setComputeString');
}
if ($this->typeMatrix === null) {
$this->computeMatrix($this->x, $this->y);
}
return $this->typeMatrix;
}
private function getDistanceMatrix() {
if ($this->distanceMatrix === null) {
$this->computeMatrix($this->x, $this->y);
}
return $this->distanceMatrix;
}
private function computeMatrix(array $x, array $y) {
$xl = count($x);
$yl = count($y);
$m = array();
$infinity = $this->getInfinity();
$use_damerau = ($this->transposeCost !== null);
if ($use_damerau) {
// Initialize the default cost of a transpose.
$m[-1][-1] = $infinity;
// Initialize the character position dictionary for Damerau.
$d = array();
foreach ($x as $c) {
$d[$c] = -1;
}
foreach ($y as $c) {
$d[$c] = -1;
}
// Initialize the transpose costs for Damerau.
for ($xx = 0; $xx <= $xl; $xx++) {
$m[$xx][-1] = $infinity;
}
for ($yy = 0; $yy <= $yl; $yy++) {
$m[-1][$yy] = $infinity;
}
}
// Initialize the top row of the matrix.
for ($xx = 0; $xx <= $xl; $xx++) {
$m[$xx][0] = $xx * $this->deleteCost;
}
// Initialize the left column of the matrix.
$cost = 0;
for ($yy = 0; $yy <= $yl; $yy++) {
$m[0][$yy] = $yy * $this->insertCost;
}
$use_types = ($this->computeString);
if ($use_types) {
$t = array();
for ($xx = 0; $xx <= $xl; $xx++) {
$t[$xx][0] = 'd';
}
for ($yy = 0; $yy <= $yl; $yy++) {
$t[0][$yy] = 'i';
}
$t[0][0] = 's';
}
$alt_cost = $this->getAlterCost();
if ($alt_cost && !$use_types) {
throw new Exception(
pht(
'If you provide an alter cost with %s, you must enable '.
'type computation with %s.',
'setAlterCost()',
'setComputeStrings()'));
}
// Build the edit distance matrix.
for ($xx = 1; $xx <= $xl; $xx++) {
$last_dy = -1;
for ($yy = 1; $yy <= $yl; $yy++) {
if ($use_damerau) {
$dx = $d[$y[$yy - 1]];
$dy = $last_dy;
}
if ($x[$xx - 1] === $y[$yy - 1]) {
$rep_cost = $m[$xx - 1][$yy - 1] + 0;
$rep_type = 's';
} else {
$rep_cost = $m[$xx - 1][$yy - 1] + $this->replaceCost;
$rep_type = 'x';
}
$del_cost = $m[$xx - 1][$yy ] + $this->deleteCost;
$ins_cost = $m[$xx ][$yy - 1] + $this->insertCost;
if ($alt_cost) {
$del_char = $t[$xx - 1][$yy ];
$ins_char = $t[$xx ][$yy - 1];
$rep_char = $t[$xx - 1][$yy - 1];
if ($del_char !== 'd') {
$del_cost += $alt_cost;
}
if ($ins_char !== 'i') {
$ins_cost += $alt_cost;
}
if ($rep_char !== $rep_type) {
$rep_cost += $alt_cost;
}
}
if ($rep_cost <= $del_cost && $rep_cost <= $ins_cost) {
$cost = $rep_cost;
$type = $rep_type;
if ($rep_type === 's') {
$last_dy = $yy - 1;
}
} else if ($ins_cost <= $del_cost) {
$cost = $ins_cost;
$type = 'i';
} else {
$cost = $del_cost;
$type = 'd';
}
if ($use_damerau) {
$trn_count = (($xx - $dx - 2) + ($yy - $dy - 2) + 1);
$trn_cost = $m[$dx][$dy] + ($trn_count * $this->transposeCost);
if ($trn_cost < $cost) {
$cost = $trn_cost;
$type = array($dx + 1, $dy + 1, $type);
}
}
$m[$xx][$yy] = $cost;
if ($use_types) {
$t[$xx][$yy] = $type;
}
}
if ($use_damerau) {
$d[$x[$xx - 1]] = ($xx - 1);
}
}
$this->distanceMatrix = $m;
if ($use_types) {
$this->typeMatrix = $t;
}
}
private function getInfinity() {
return INF;
}
private function printMatrix(array $m) {
$x = $this->x;
$y = $this->y;
$p = '% 8s ';
printf($p, ' ');
foreach (head($m) as $yk => $yv) {
printf($p, idx($y, $yk - 1, '-'));
}
echo "\n";
foreach ($m as $xk => $xv) {
printf($p, idx($x, $xk - 1, '-'));
foreach ($xv as $yk => $yv) {
printf($p, ($yv == $this->getInfinity() ? 'inf' : $yv));
}
echo "\n";
}
}
private function printTypeMatrix(array $t) {
$x = $this->x;
$y = $this->y;
$p = '% 8s ';
printf($p, ' ');
foreach (head($t) as $yk => $yv) {
printf($p, idx($y, $yk - 1, '-'));
}
echo "\n";
foreach ($t as $xk => $xv) {
printf($p, idx($x, $xk - 1, '-'));
foreach ($xv as $yk => $yv) {
printf($p, ($yv == $this->getInfinity() ? 'inf' : $yv));
}
echo "\n";
}
}
private function applySmoothing($str, $full) {
if ($full) {
$prefix = '(^|[xdi])';
$suffix = '([xdi]|\z)';
} else {
$prefix = '([xdi])';
$suffix = '([xdi])';
}
// Smooth the string out, by replacing short runs of similar characters
// with 'x' operations. This makes the result more readable to humans,
// since there are fewer choppy runs of short added and removed substrings.
do {
$original = $str;
$str = preg_replace('/'.$prefix.'(s{3})'.$suffix.'/', '$1xxx$3', $str);
$str = preg_replace('/'.$prefix.'(s{2})'.$suffix.'/', '$1xx$3', $str);
$str = preg_replace('/'.$prefix.'(s{1})'.$suffix.'/', '$1x$3', $str);
} while ($str != $original);
return $str;
}
}
diff --git a/src/utils/PhutilProseDiff.php b/src/utils/PhutilProseDiff.php
index 5507e52..df34d64 100644
--- a/src/utils/PhutilProseDiff.php
+++ b/src/utils/PhutilProseDiff.php
@@ -1,286 +1,292 @@
parts[] = array(
'type' => $type,
'text' => $text,
);
return $this;
}
public function getParts() {
return $this->parts;
}
/**
* Get diff parts, but replace large blocks of unchanged text with "."
* parts representing missing context.
*/
public function getSummaryParts() {
$parts = $this->getParts();
$head_key = head_key($parts);
$last_key = last_key($parts);
$results = array();
foreach ($parts as $key => $part) {
$is_head = ($key == $head_key);
$is_last = ($key == $last_key);
switch ($part['type']) {
case '=':
$pieces = $this->splitTextForSummary($part['text']);
if ($is_head || $is_last) {
$need = 2;
} else {
$need = 3;
}
// We don't have enough pieces to omit anything, so just continue.
if (count($pieces) < $need) {
$results[] = $part;
break;
}
if (!$is_head) {
$results[] = array(
'type' => '=',
'text' => head($pieces),
);
}
$results[] = array(
'type' => '.',
'text' => null,
);
if (!$is_last) {
$results[] = array(
'type' => '=',
'text' => last($pieces),
);
}
break;
default:
$results[] = $part;
break;
}
}
return $results;
}
public function reorderParts() {
// Reorder sequences of removed and added sections to put all the "-"
// parts together first, then all the "+" parts together. This produces
// a more human-readable result than intermingling them.
$o_run = array();
$n_run = array();
$result = array();
foreach ($this->parts as $part) {
$type = $part['type'];
switch ($type) {
case '-':
$o_run[] = $part;
break;
case '+':
$n_run[] = $part;
break;
default:
if ($o_run || $n_run) {
foreach ($this->combineRuns($o_run, $n_run) as $merged_part) {
$result[] = $merged_part;
}
$o_run = array();
$n_run = array();
}
$result[] = $part;
break;
}
}
if ($o_run || $n_run) {
foreach ($this->combineRuns($o_run, $n_run) as $part) {
$result[] = $part;
}
}
// Now, combine consecuitive runs of the same type of change (like a
// series of "-" parts) into a single run.
$combined = array();
$last = null;
$last_text = null;
foreach ($result as $part) {
$type = $part['type'];
if ($last !== $type) {
if ($last !== null) {
$combined[] = array(
'type' => $last,
'text' => $last_text,
);
}
$last_text = null;
$last = $type;
}
$last_text .= $part['text'];
}
if ($last_text !== null) {
$combined[] = array(
'type' => $last,
'text' => $last_text,
);
}
$this->parts = $combined;
return $this;
}
private function combineRuns($o_run, $n_run) {
$o_merge = $this->mergeParts($o_run);
$n_merge = $this->mergeParts($n_run);
// When removed and added blocks share a prefix or suffix, we sometimes
// want to count it as unchanged (for example, if it is whitespace) but
// sometimes want to count it as changed (for example, if it is a word
// suffix like "ing"). Find common prefixes and suffixes of these layout
// characters and emit them as "=" (unchanged) blocks.
$layout_characters = array(
' ' => true,
"\n" => true,
'.' => true,
'!' => true,
',' => true,
'?' => true,
+ ']' => true,
+ '[' => true,
+ '(' => true,
+ ')' => true,
+ '<' => true,
+ '>' => true,
);
$o_text = $o_merge['text'];
$n_text = $n_merge['text'];
$o_len = strlen($o_text);
$n_len = strlen($n_text);
$min_len = min($o_len, $n_len);
$prefix_len = 0;
for ($pos = 0; $pos < $min_len; $pos++) {
$o = $o_text[$pos];
$n = $n_text[$pos];
if ($o !== $n) {
break;
}
if (empty($layout_characters[$o])) {
break;
}
$prefix_len++;
}
$suffix_len = 0;
for ($pos = 0; $pos < ($min_len - $prefix_len); $pos++) {
$o = $o_text[$o_len - ($pos + 1)];
$n = $n_text[$n_len - ($pos + 1)];
if ($o !== $n) {
break;
}
if (empty($layout_characters[$o])) {
break;
}
$suffix_len++;
}
$results = array();
if ($prefix_len) {
$results[] = array(
'type' => '=',
'text' => substr($o_text, 0, $prefix_len),
);
}
if ($prefix_len < $o_len) {
$results[] = array(
'type' => '-',
'text' => substr(
$o_text,
$prefix_len,
$o_len - $prefix_len - $suffix_len),
);
}
if ($prefix_len < $n_len) {
$results[] = array(
'type' => '+',
'text' => substr(
$n_text,
$prefix_len,
$n_len - $prefix_len - $suffix_len),
);
}
if ($suffix_len) {
$results[] = array(
'type' => '=',
'text' => substr($o_text, -$suffix_len),
);
}
return $results;
}
private function mergeParts(array $parts) {
$text = '';
$type = null;
foreach ($parts as $part) {
$part_type = $part['type'];
if ($type === null) {
$type = $part_type;
}
if ($type !== $part_type) {
throw new Exception(pht('Can not merge parts of dissimilar types!'));
}
$text .= $part['text'];
}
return array(
'type' => $type,
'text' => $text,
);
}
private function splitTextForSummary($text) {
$matches = null;
$ok = preg_match('/^(\n*[^\n]+)\n/', $text, $matches);
if (!$ok) {
return array($text);
}
$head = $matches[1];
$text = substr($text, strlen($head));
$ok = preg_match('/\n([^\n]+\n*)\z/', $text, $matches);
if (!$ok) {
return array($text);
}
$last = $matches[1];
$text = substr($text, 0, -strlen($last));
if (!strlen(trim($text))) {
return array($head, $last);
} else {
return array($head, $text, $last);
}
}
}
diff --git a/src/utils/PhutilProseDifferenceEngine.php b/src/utils/PhutilProseDifferenceEngine.php
index 6000f1a..5deb8e6 100644
--- a/src/utils/PhutilProseDifferenceEngine.php
+++ b/src/utils/PhutilProseDifferenceEngine.php
@@ -1,199 +1,209 @@
buildDiff($u, $v, 1);
+ return $this->buildDiff($u, $v, 0);
}
private function buildDiff($u, $v, $level) {
$u_parts = $this->splitCorpus($u, $level);
$v_parts = $this->splitCorpus($v, $level);
$matrix = id(new PhutilEditDistanceMatrix())
->setMaximumLength(128)
->setSequences($u_parts, $v_parts)
->setComputeString(true);
// For word-level and character-level changes, smooth the output string
// to reduce the choppiness of the diff.
if ($level > 1) {
$matrix->setApplySmoothing(PhutilEditDistanceMatrix::SMOOTHING_FULL);
}
$u_pos = 0;
$v_pos = 0;
$edits = $matrix->getEditString();
$edits_length = strlen($edits);
$diff = new PhutilProseDiff();
for ($ii = 0; $ii < $edits_length; $ii++) {
$c = $edits[$ii];
if ($c == 's') {
$diff->addPart('=', $u_parts[$u_pos]);
$u_pos++;
$v_pos++;
} else if ($c == 'd') {
$diff->addPart('-', $u_parts[$u_pos]);
$u_pos++;
} else if ($c == 'i') {
$diff->addPart('+', $v_parts[$v_pos]);
$v_pos++;
} else if ($c == 'x') {
$diff->addPart('-', $u_parts[$u_pos]);
$diff->addPart('+', $v_parts[$v_pos]);
$u_pos++;
$v_pos++;
} else {
throw new Exception(
pht(
'Unexpected character ("%s") in edit string.',
$c));
}
}
$diff->reorderParts();
// If we just built a character-level diff, we're all done and do not
// need to go any deeper.
if ($level == 3) {
return $diff;
}
$blocks = array();
$block = null;
foreach ($diff->getParts() as $part) {
$type = $part['type'];
$text = $part['text'];
switch ($type) {
case '=':
if ($block) {
$blocks[] = $block;
$block = null;
}
$blocks[] = array(
'type' => $type,
'text' => $text,
);
break;
case '-':
if (!$block) {
$block = array(
'type' => '!',
'old' => '',
'new' => '',
);
}
$block['old'] .= $text;
break;
case '+':
if (!$block) {
$block = array(
'type' => '!',
'old' => '',
'new' => '',
);
}
$block['new'] .= $text;
break;
}
}
if ($block) {
$blocks[] = $block;
}
$result = new PhutilProseDiff();
foreach ($blocks as $block) {
$type = $block['type'];
if ($type == '=') {
$result->addPart('=', $block['text']);
} else {
$old = $block['old'];
$new = $block['new'];
if (!strlen($old) && !strlen($new)) {
// Nothing to do.
} else if (!strlen($old)) {
$result->addPart('+', $new);
} else if (!strlen($new)) {
$result->addPart('-', $old);
} else {
- $subdiff = $this->buildDiff(
- $old,
- $new,
- $level + 1);
-
- foreach ($subdiff->getParts() as $part) {
- $result->addPart($part['type'], $part['text']);
+ if ($matrix->didReachMaximumLength()) {
+ // If this text was too big to diff, don't try to subdivide it.
+ $result->addPart('-', $old);
+ $result->addPart('+', $new);
+ } else {
+ $subdiff = $this->buildDiff(
+ $old,
+ $new,
+ $level + 1);
+
+ foreach ($subdiff->getParts() as $part) {
+ $result->addPart($part['type'], $part['text']);
+ }
}
}
}
}
$result->reorderParts();
return $result;
}
private function splitCorpus($corpus, $level) {
switch ($level) {
+ case 0:
+ // Level 0: Split into paragraphs.
+ $expr = '/([\n]+)/';
+ break;
case 1:
// Level 1: Split into sentences.
$expr = '/([\n,!;?\.]+)/';
break;
case 2:
// Level 2: Split into words.
$expr = '/(\s+)/';
break;
case 3:
// Level 3: Split into characters.
return phutil_utf8v_combined($corpus);
}
$pieces = preg_split($expr, $corpus, -1, PREG_SPLIT_DELIM_CAPTURE);
return $this->stitchPieces($pieces, $level);
}
private function stitchPieces(array $pieces, $level) {
$results = array();
$count = count($pieces);
for ($ii = 0; $ii < $count; $ii += 2) {
$result = $pieces[$ii];
if ($ii + 1 < $count) {
$result .= $pieces[$ii + 1];
}
- if ($level == 1) {
+ if ($level < 2) {
// Split pieces into separate text and whitespace sections: make one
// piece out of all the whitespace at the beginning, one piece out of
// all the actual text in the middle, and one piece out of all the
// whitespace at the end.
$matches = null;
preg_match('/^(\s*)(.*?)(\s*)\z/', $result, $matches);
if (strlen($matches[1])) {
$results[] = $matches[1];
}
if (strlen($matches[2])) {
$results[] = $matches[2];
}
if (strlen($matches[3])) {
$results[] = $matches[3];
}
} else {
$results[] = $result;
}
}
// If the input ended with a delimiter, we can get an empty final piece.
// Just discard it.
if (last($results) == '') {
array_pop($results);
}
return $results;
}
}
diff --git a/src/utils/__tests__/PhutilProseDiffTestCase.php b/src/utils/__tests__/PhutilProseDiffTestCase.php
index 8d40484..bf0375e 100644
--- a/src/utils/__tests__/PhutilProseDiffTestCase.php
+++ b/src/utils/__tests__/PhutilProseDiffTestCase.php
@@ -1,204 +1,245 @@
assertProseParts(
'',
'',
array(),
pht('Empty'));
$this->assertProseParts(
"xxx\nyyy",
"xxx\nzzz\nyyy",
array(
"= xxx\n",
"+ zzz\n",
'= yyy',
),
pht('Add Paragraph'));
$this->assertProseParts(
"xxx\nzzz\nyyy",
"xxx\nyyy",
array(
"= xxx\n",
"- zzz\n",
'= yyy',
),
pht('Remove Paragraph'));
// Without smoothing, the alogorithm identifies that "shark" and "cat"
// both contain the letter "a" and tries to express this as a very
// fine-grained edit which replaces "sh" with "c" and then "rk" with "t".
// This is technically correct, but it is much easier for human viewers to
// parse if we smooth this into a single removal and a single addition.
$this->assertProseParts(
'They say the shark has nine lives.',
'They say the cat has nine lives.',
array(
'= They say the ',
'- shark',
'+ cat',
'= has nine lives.',
),
pht('"Shark/cat" word edit smoothenss.'));
$this->assertProseParts(
'Rising quickly, she says',
'Rising quickly, she remarks:',
array(
'= Rising quickly, she ',
'- says',
'+ remarks:',
),
pht('"Says/remarks" word edit smoothenss.'));
$this->assertProseParts(
'See screenshots',
'Viewed video files',
array(
'- See screenshots',
'+ Viewed video files',
),
pht('Complete paragraph rewrite.'));
$this->assertProseParts(
'xaaax',
'xbbbx',
array(
'- xaaax',
'+ xbbbx',
),
pht('Whole word rewrite with common prefix and suffix.'));
$this->assertProseParts(
' aaa ',
' bbb ',
array(
'= ',
'- aaa',
'+ bbb',
'= ',
),
pht('Whole word rewrite with whitespace prefix and suffix.'));
$this->assertSummaryProseParts(
"a\nb\nc\nd\ne\nf\ng\nh\n",
"a\nb\nc\nd\nX\nf\ng\nh\n",
array(
'.',
"= d\n",
'- e',
'+ X',
"= \nf",
'.',
),
pht('Summary diff with middle change.'));
$this->assertSummaryProseParts(
"a\nb\nc\nd\ne\nf\ng\nh\n",
"X\nb\nc\nd\ne\nf\ng\nh\n",
array(
'- a',
'+ X',
"= \nb",
'.',
),
pht('Summary diff with head change.'));
$this->assertSummaryProseParts(
"a\nb\nc\nd\ne\nf\ng\nh\n",
"a\nb\nc\nd\ne\nf\ng\nX\n",
array(
'.',
"= g\n",
'- h',
'+ X',
"= \n",
),
pht('Summary diff with last change.'));
$this->assertProseParts(
'aaa aaa aaa aaa, bbb bbb bbb bbb.',
"aaa aaa aaa aaa, bbb bbb bbb bbb.\n\n- ccc ccc ccc",
array(
'= aaa aaa aaa aaa, bbb bbb bbb bbb.',
"+ \n\n- ccc ccc ccc",
),
pht('Diff with new trailing content.'));
$this->assertProseParts(
'aaa aaa aaa aaa, bbb bbb bbb bbb.',
'aaa aaa aaa aaa bbb bbb bbb bbb.',
array(
'= aaa aaa aaa aaa',
'- ,',
'= bbb bbb bbb bbb.',
),
pht('Diff with a removed comma.'));
$this->assertProseParts(
'aaa aaa aaa aaa, bbb bbb bbb bbb.',
"aaa aaa aaa aaa bbb bbb bbb bbb.\n\n- ccc ccc ccc!",
array(
'= aaa aaa aaa aaa',
'- ,',
'= bbb bbb bbb bbb.',
"+ \n\n- ccc ccc ccc!",
),
pht('Diff with a removed comma and new trailing content.'));
+ $this->assertProseParts(
+ '[ ] Walnuts',
+ '[X] Walnuts',
+ array(
+ '= [',
+ '- ',
+ '+ X',
+ '= ] Walnuts',
+ ),
+ pht('Diff adding a tickmark to a checkbox list.'));
+
+ $this->assertProseParts(
+ '[[ ./week49 ]]',
+ '[[ ./week50 ]]',
+ array(
+ '= [[ ./week',
+ '- 49',
+ '+ 50',
+ '= ]]',
+ ),
+ pht('Diff changing a remarkup wiki link target.'));
+
+ // Create a large corpus with many sentences and paragraphs.
+ $large_paragraph = 'xyz. ';
+ $large_paragraph = str_repeat($large_paragraph, 50);
+ $large_paragraph = rtrim($large_paragraph);
+
+ $large_corpus = $large_paragraph."\n\n";
+ $large_corpus = str_repeat($large_corpus, 50);
+ $large_corpus = rtrim($large_corpus);
+
+ $this->assertProseParts(
+ $large_corpus,
+ "aaa\n\n".$large_corpus."\n\nzzz",
+ array(
+ "+ aaa\n\n",
+ '= '.$large_corpus,
+ "+ \n\nzzz",
+ ),
+ pht('Adding initial and final lines to a large corpus.'));
+
}
private function assertProseParts($old, $new, array $expect_parts, $label) {
$engine = new PhutilProseDifferenceEngine();
$diff = $engine->getDiff($old, $new);
$parts = $diff->getParts();
$this->assertParts($expect_parts, $parts, $label);
}
private function assertSummaryProseParts(
$old,
$new,
array $expect_parts,
$label) {
$engine = new PhutilProseDifferenceEngine();
$diff = $engine->getDiff($old, $new);
$parts = $diff->getSummaryParts();
$this->assertParts($expect_parts, $parts, $label);
}
private function assertParts(
array $expect,
array $actual_parts,
$label) {
$actual = array();
foreach ($actual_parts as $actual_part) {
$type = $actual_part['type'];
$text = $actual_part['text'];
switch ($type) {
case '.':
$actual[] = $type;
break;
default:
$actual[] = "{$type} {$text}";
break;
}
}
$this->assertEqual($expect, $actual, $label);
}
}