diff --git a/src/__phutil_library_map__.php b/src/__phutil_library_map__.php --- a/src/__phutil_library_map__.php +++ b/src/__phutil_library_map__.php @@ -132,6 +132,11 @@ 'PhutilCIDRBlock' => 'ip/PhutilCIDRBlock.php', 'PhutilCIDRList' => 'ip/PhutilCIDRList.php', 'PhutilCLikeCodeSnippetContextFreeGrammar' => 'grammar/code/PhutilCLikeCodeSnippetContextFreeGrammar.php', + 'PhutilCalendarContainerNode' => 'parser/calendar/data/PhutilCalendarContainerNode.php', + 'PhutilCalendarDocumentNode' => 'parser/calendar/data/PhutilCalendarDocumentNode.php', + 'PhutilCalendarEventNode' => 'parser/calendar/data/PhutilCalendarEventNode.php', + 'PhutilCalendarNode' => 'parser/calendar/data/PhutilCalendarNode.php', + 'PhutilCalendarRawNode' => 'parser/calendar/data/PhutilCalendarRawNode.php', 'PhutilCallbackFilterIterator' => 'utils/PhutilCallbackFilterIterator.php', 'PhutilCallbackSignalHandler' => 'future/exec/PhutilCallbackSignalHandler.php', 'PhutilChannel' => 'channel/PhutilChannel.php', @@ -222,6 +227,8 @@ 'PhutilHelpArgumentWorkflow' => 'parser/argument/workflow/PhutilHelpArgumentWorkflow.php', 'PhutilHgsprintfTestCase' => 'xsprintf/__tests__/PhutilHgsprintfTestCase.php', 'PhutilHighIntensityIntervalDaemon' => 'daemon/torture/PhutilHighIntensityIntervalDaemon.php', + 'PhutilICSParser' => 'parser/calendar/ics/PhutilICSParser.php', + 'PhutilICSParserTestCase' => 'parser/calendar/ics/__tests__/PhutilICSParserTestCase.php', 'PhutilINIParserException' => 'parser/exception/PhutilINIParserException.php', 'PhutilIPAddress' => 'ip/PhutilIPAddress.php', 'PhutilIPAddressTestCase' => 'ip/__tests__/PhutilIPAddressTestCase.php', @@ -699,6 +706,11 @@ 'PhutilCIDRBlock' => 'Phobject', 'PhutilCIDRList' => 'Phobject', 'PhutilCLikeCodeSnippetContextFreeGrammar' => 'PhutilCodeSnippetContextFreeGrammar', + 'PhutilCalendarContainerNode' => 'PhutilCalendarNode', + 'PhutilCalendarDocumentNode' => 'PhutilCalendarContainerNode', + 'PhutilCalendarEventNode' => 'PhutilCalendarNode', + 'PhutilCalendarNode' => 'Phobject', + 'PhutilCalendarRawNode' => 'PhutilCalendarNode', 'PhutilCallbackFilterIterator' => 'FilterIterator', 'PhutilCallbackSignalHandler' => 'PhutilSignalHandler', 'PhutilChannel' => 'Phobject', @@ -795,6 +807,8 @@ 'PhutilHelpArgumentWorkflow' => 'PhutilArgumentWorkflow', 'PhutilHgsprintfTestCase' => 'PhutilTestCase', 'PhutilHighIntensityIntervalDaemon' => 'PhutilTortureTestDaemon', + 'PhutilICSParser' => 'Phobject', + 'PhutilICSParserTestCase' => 'PhutilTestCase', 'PhutilINIParserException' => 'Exception', 'PhutilIPAddress' => 'Phobject', 'PhutilIPAddressTestCase' => 'PhutilTestCase', diff --git a/src/parser/calendar/data/PhutilCalendarContainerNode.php b/src/parser/calendar/data/PhutilCalendarContainerNode.php new file mode 100644 --- /dev/null +++ b/src/parser/calendar/data/PhutilCalendarContainerNode.php @@ -0,0 +1,30 @@ +children; + } + + final public function getChildrenOfType($type) { + $result = array(); + + foreach ($this->getChildren() as $key => $child) { + if ($child->getNodeType() != $type) { + continue; + } + $result[$key] = $child; + } + + return $result; + } + + final public function appendChild(PhutilCalendarNode $node) { + $this->children[] = $node; + return $this; + } + +} diff --git a/src/parser/calendar/data/PhutilCalendarDocumentNode.php b/src/parser/calendar/data/PhutilCalendarDocumentNode.php new file mode 100644 --- /dev/null +++ b/src/parser/calendar/data/PhutilCalendarDocumentNode.php @@ -0,0 +1,12 @@ +getChildrenOfType(PhutilCalendarEventNode::NODETYPE); + } + +} diff --git a/src/parser/calendar/data/PhutilCalendarEventNode.php b/src/parser/calendar/data/PhutilCalendarEventNode.php new file mode 100644 --- /dev/null +++ b/src/parser/calendar/data/PhutilCalendarEventNode.php @@ -0,0 +1,8 @@ +getPhobjectClassConstant('NODETYPE'); + } + + final public function setAttribute($key, $value) { + $this->attributes[$key] = $value; + return $this; + } + + final public function getAttribute($key, $default = null) { + return idx($this->attributes, $key, $default); + } + +} diff --git a/src/parser/calendar/data/PhutilCalendarRawNode.php b/src/parser/calendar/data/PhutilCalendarRawNode.php new file mode 100644 --- /dev/null +++ b/src/parser/calendar/data/PhutilCalendarRawNode.php @@ -0,0 +1,8 @@ +stack = array(); + $this->node = null; + $this->document = null; + + $lines = $this->unfoldICSLines($data); + + foreach ($lines as $line) { + $matches = null; + if (preg_match('(^BEGIN:(.*)\z)', $line, $matches)) { + $this->beginParsingNode($matches[1]); + } else if (preg_match('(^END:(.*)\z)', $line, $matches)) { + $this->endParsingNode($matches[1]); + } else { + $this->parseICSProperty($line); + } + } + + if (!$this->document) { + $this->raiseParseFailure( + pht( + 'Expected ICS document to define a "VCALENDAR" section.')); + } + + if ($this->stack) { + $this->raiseParseFailure( + pht( + 'Expected all "BEGIN:" sections in ICS document to have '. + 'corresponding "END:" sections.')); + } + + $document = $this->document; + $this->document = null; + + return $document; + } + + private function getNode() { + return $this->node; + } + + private function unfoldICSLines($data) { + $lines = phutil_split_lines($data, $retain_endings = false); + + // ICS files are wrapped at 75 characters, with overlong lines continued + // on the following line with an initial space or tab. Unwrap all of the + // lines in the file. + $last = null; + foreach ($lines as $idx => $line) { + if (!preg_match('/^[ \t]/', $line)) { + $last = $idx; + continue; + } + + if ($last === null) { + throw new Exception( + pht( + 'First line of ICS file begins with a space or tab, but this '. + 'marks a continuation line.')); + } + + $lines[$last] = $lines[$last].substr($line, 1); + unset($lines[$idx]); + } + + return $lines; + } + + private function beginParsingNode($type) { + $node = $this->getNode(); + $new_node = $this->newICSNode($type); + + if ($node) { + if ($node instanceof PhutilCalendarContainerNode) { + $node->appendChild($new_node); + } else { + $this->raiseParseFailure( + pht( + 'Found unexpected node "%s" inside node "%s".', + $new_node->getAttribute('ics.type'), + $node->getAttribute('ics.type'))); + } + } else { + if ($new_node instanceof PhutilCalendarDocumentNode) { + if ($this->document) { + $this->raiseParseFailure( + pht( + 'Found multiple "VCALENDAR" nodes in ICS document, '. + 'expected only one.')); + } else { + $this->document = $new_node; + } + } else { + $this->raiseParseFailure( + pht( + 'Expected ICS document to begin "BEGIN:VCALENDAR".')); + } + } + + $this->stack[] = $new_node; + $this->node = $new_node; + + return $this; + } + + private function newICSNode($type) { + switch ($type) { + case 'VCALENDAR': + $node = new PhutilCalendarDocumentNode(); + break; + case 'VEVENT': + $node = new PhutilCalendarEventNode(); + break; + default: + $node = new PhutilCalendarRawNode(); + break; + } + + $node->setAttribute('ics.type', $type); + + return $node; + } + + private function endParsingNode($type) { + $node = $this->getNode(); + if (!$node) { + $this->raiseParseFailure( + pht( + 'Found unexpected "END" without a "BEGIN".')); + } + + $old_type = $node->getAttribute('ics.type'); + if ($old_type != $type) { + $this->raiseParseFailure( + pht( + 'Found mismatched "BEGIN" ("%s") and "END" ("%s") sections.', + $old_type, + $type)); + } + + array_pop($this->stack); + if ($this->stack) { + $this->node = last($this->stack); + } else { + $this->node = null; + } + + return $this; + } + + private function parseICSProperty($line) { + $matches = null; + + // Properties begin with an alphanumeric name with no escaping, followed + // by either a ";" (to begin a list of parameters) or a ":" (to begin + // the actual field body). + + $ok = preg_match('(^([^;:]+)([;:])(.*)\z)', $line, $matches); + if (!$ok) { + $this->raiseParseFailure( + pht( + 'Found malformed line in ICS document: %s', + $line)); + } + + $name = $matches[1]; + $body = $matches[3]; + $has_parameters = ($matches[2] == ';'); + + $parameters = array(); + if ($has_parameters) { + // Parameters are a sensible name, a literal "=", a pile of magic, + // and then maybe a comma and another parameter. + + while (true) { + // We're going to get the first couple of parts first. + $ok = preg_match('(^([^=]+)=)', $body, $matches); + if (!$ok) { + $this->raiseParseFailure( + pht( + 'Found malformed property in ICS document: %s', + $body)); + } + + $param_name = $matches[1]; + $body = substr($body, strlen($matches[0])); + + // Now we're going to match zero or more values. + $param_values = array(); + while (true) { + // The value can either be a double-quoted string or an unquoted + // string, with some characters forbidden. + if (strlen($body) && $body[0] == '"') { + $is_quoted = true; + $ok = preg_match( + '(^"([^\x00-\x08\x10-\x19"]*)")', + $body, + $matches); + if (!$ok) { + $this->raiseParseFailure( + pht( + 'Found malformed double-quoted string in ICS document '. + 'parameter value: %s', + $body)); + } + } else { + $is_quoted = false; + $ok = preg_match( + '(^([^\x00-\x08\x10-\x19";:,]*))', + $body, + $matches); + if (!$ok) { + $this->raiseParseFailure( + pht( + 'Found malformed unquoted string in ICS document '. + 'parameter value: %s', + $body)); + } + } + + // NOTE: RFC5545 says "Property parameter values that are not in + // quoted-strings are case-insensitive." -- that is, the quoted and + // unquoted representations are not equivalent. Thus, preserve the + // original formatting in case we ever need to respect this. + + $param_values[] = array( + 'value' => $this->unescapeParameterValue($matches[1]), + 'quoted' => $is_quoted, + ); + + $body = substr($body, strlen($matches[0])); + if (!strlen($body)) { + $this->raiseParseFailure( + pht( + 'Expected ":" after parameters in ICS document property.')); + } + + // If we have a comma now, we're going to read another value. Strip + // it off and keep going. + if ($body[0] == ',') { + $body = substr($body, 1); + continue; + } + + // If we have a colon, this is the last value and also the last + // property. Break, then handle the colon below. + if ($body[0] == ':') { + break; + } + + // We aren't expecting anything else. + $this->raiseParseFailure( + pht( + 'Found unexpected text after reading parameter value: %s', + $body)); + } + + $parameters[] = array( + 'name' => $param_name, + 'values' => $param_values, + ); + + if ($body[0] == ':') { + $body = substr($body, 1); + break; + } + } + } + + $value = $this->unescapeFieldValue($name, $parameters, $body); + + $node = $this->getNode(); + $raw = $node->getAttribute('ics.properties', array()); + $raw[] = array( + 'name' => $name, + 'parameters' => $parameters, + 'value' => $value, + ); + $node->setAttribute('ics.properties', $raw); + } + + private function unescapeParameterValue($data) { + // The parameter grammar is adjusted by RFC6868 to permit escaping with + // carets. Remove that escaping. + + // This escaping is a bit weird because it's trying to be backwards + // compatible and the original spec didn't think about this and didn't + // provide much room to fix things. + + $out = ''; + $esc = false; + foreach (phutil_utf8v($data) as $c) { + if (!$esc) { + if ($c != '^') { + $out .= $c; + } else { + $esc = true; + } + } else { + switch ($c) { + case 'n': + $out .= "\n"; + break; + case '^': + $out .= '^'; + break; + case "'": + // NOTE: This is " " being decoded into a + // double quote! + $out .= '"'; + break; + default: + // NOTE: The caret is NOT an escape for any other characters. + // This is a "MUST" requirement of RFC6868. + $out .= '^'.$c; + break; + } + } + } + + // NOTE: Because caret on its own just means "caret" for backward + // compatibility, we don't warn if we're still in escaped mode once we + // reach the end of the string. + + return $out; + } + + private function unescapeFieldValue($name, array $parameters, $data) { + // NOTE: The encoding of the field value data is dependent on the field + // name (which defines a default encoding) and the parameters (which may + // include "VALUE", specifying a type of the data. + + $default_types = array( + 'CALSCALE' => 'TEXT', + 'METHOD' => 'TEXT', + 'PRODID' => 'TEXT', + 'VERSION' => 'TEXT', + + 'ATTACH' => 'URI', + 'CATEGORIES' => 'TEXT', + 'CLASS' => 'TEXT', + 'COMMENT' => 'TEXT', + 'DESCRIPTION' => 'TEXT', + + // TODO: The spec appears to contradict itself: it says that the value + // type is FLOAT, but it also says that this property value is actually + // two semicolon-separated values, which is not what FLOAT is defined as. + 'GEO' => 'TEXT', + + 'LOCATION' => 'TEXT', + 'PERCENT-COMPLETE' => 'INTEGER', + 'PRIORITY' => 'INTEGER', + 'RESOURCES' => 'TEXT', + 'STATUS' => 'TEXT', + 'SUMMARY' => 'TEXT', + + 'COMPLETED' => 'DATE-TIME', + 'DTEND' => 'DATE-TIME', + 'DUE' => 'DATE-TIME', + 'DTSTART' => 'DATE-TIME', + 'DURATION' => 'DURATION', + 'FREEBUSY' => 'PERIOD', + 'TRANSP' => 'TEXT', + + 'TZID' => 'TEXT', + 'TZNAME' => 'TEXT', + 'TZOFFSETFROM' => 'UTC-OFFSET', + 'TZOFFSETTO' => 'UTC-OFFSET', + 'TZURL' => 'URI', + + 'ATTENDEE' => 'CAL-ADDRESS', + 'CONTACT' => 'TEXT', + 'ORGANIZER' => 'CAL-ADDRESS', + 'RECURRENCE-ID' => 'DATE-TIME', + 'RELATED-TO' => 'TEXT', + 'URL' => 'URI', + 'UID' => 'TEXT', + 'EXDATE' => 'DATE-TIME', + 'RDATE' => 'DATE-TIME', + 'RRULE' => 'RECUR', + + 'ACTION' => 'TEXT', + 'REPEAT' => 'INTEGER', + 'TRIGGER' => 'DURATION', + + 'CREATED' => 'DATE-TIME', + 'DTSTAMP' => 'DATE-TIME', + 'LAST-MODIFIED' => 'DATE-TIME', + 'SEQUENCE' => 'INTEGER', + + 'REQUEST-STATUS' => 'TEXT', + ); + + $value_type = idx($default_types, $name, 'TEXT'); + + foreach ($parameters as $parameter) { + if ($parameter['name'] == 'VALUE') { + $value_type = idx(head($parameter['values']), 'value'); + } + } + + switch ($value_type) { + case 'BINARY': + $result = base64_decode($data); + if ($result === false) { + $this->raiseParseFailure( + pht( + 'Unable to decode base64 data: %s', + $data)); + } + break; + case 'BOOLEAN': + $map = array( + 'true' => true, + 'false' => false, + ); + $result = phutil_utf8_strtolower($data); + if (!isset($map[$result])) { + $this->raiseParseFailure( + pht( + 'Unexpected BOOLEAN value "%s".', + $data)); + } + $result = $map[$result]; + break; + case 'CAL-ADDRESS': + $result = $data; + break; + case 'DATE': + // This is a comma-separated list of "YYYYMMDD" values. + $result = explode(',', $data); + break; + case 'DATE-TIME': + $result = explode(',', $data); + break; + case 'DURATION': + $result = explode(',', $data); + break; + case 'FLOAT': + $result = explode(',', $data); + foreach ($result as $k => $v) { + $result[$k] = (float)$v; + } + break; + case 'INTEGER': + $result = explode(',', $data); + foreach ($result as $k => $v) { + $result[$k] = (int)$v; + } + break; + case 'PERIOD': + $result = explode(',', $data); + break; + case 'RECUR': + $result = $data; + break; + case 'TEXT': + $result = $this->unescapeTextValue($data); + break; + case 'TIME': + $result = explode(',', $data); + break; + case 'URI': + $result = $data; + break; + case 'UTC-OFFSET': + $result = $data; + break; + default: + // RFC5545 says we MUST preserve the data for any types we don't + // recognize. + $result = $data; + break; + } + + return array( + 'type' => $value_type, + 'value' => $result, + 'raw' => $data, + ); + } + + private function unescapeTextValue($data) { + $result = array(); + + $buf = ''; + $esc = false; + foreach (phutil_utf8v($data) as $c) { + if (!$esc) { + if ($c == '\\') { + $esc = true; + } else if ($c == ',') { + $result[] = $buf; + $buf = ''; + } else { + $buf .= $c; + } + } else { + switch ($c) { + case 'n': + case 'N': + $buf .= "\n"; + break; + default: + $buf .= $c; + break; + } + } + } + + if ($esc) { + $this->raiseParsFailure( + pht( + 'ICS document contains TEXT value ending with unescaped '. + 'backslash.')); + } + + $result[] = $buf; + + return $result; + } + + private function raiseParseFailure($message) { + throw new Exception($message); + } + +} diff --git a/src/parser/calendar/ics/__tests__/PhutilICSParserTestCase.php b/src/parser/calendar/ics/__tests__/PhutilICSParserTestCase.php new file mode 100644 --- /dev/null +++ b/src/parser/calendar/ics/__tests__/PhutilICSParserTestCase.php @@ -0,0 +1,80 @@ +parseICSDocument('simple.ics'); + + $events = $document->getEvents(); + $this->assertEqual(1, count($events)); + + $event = head($events); + $this->assertEqual( + array( + array( + 'name' => 'CREATED', + 'parameters' => array(), + 'value' => array( + 'type' => 'DATE-TIME', + 'value' => array( + '20160908T172702Z', + ), + 'raw' => '20160908T172702Z', + ), + ), + array( + 'name' => 'UID', + 'parameters' => array(), + 'value' => array( + 'type' => 'TEXT', + 'value' => array( + '1CEB57AF-0C9C-402D-B3BD-D75BD4843F68', + ), + 'raw' => '1CEB57AF-0C9C-402D-B3BD-D75BD4843F68', + ), + ), + array( + 'name' => 'DTEND', + 'parameters' => array( + array( + 'name' => 'TZID', + 'values' => array( + array( + 'value' => 'America/Los_Angeles', + 'quoted' => false, + ), + ), + ), + ), + 'value' => array( + 'type' => 'DATE-TIME', + 'value' => array( + '20160915T100000', + ), + 'raw' => '20160915T100000', + ), + ), + array( + 'name' => 'SUMMARY', + 'parameters' => array(), + 'value' => array( + 'type' => 'TEXT', + 'value' => array( + 'Example Event', + ), + 'raw' => 'Example Event', + ), + ), + ), + $event->getAttribute('ics.properties')); + } + + private function parseICSDocument($name) { + $path = dirname(__FILE__).'/data/'.$name; + $data = Filesystem::readFile($path); + return id(new PhutilICSParser()) + ->parseICSData($data); + } + + +} diff --git a/src/parser/calendar/ics/__tests__/data/simple.ics b/src/parser/calendar/ics/__tests__/data/simple.ics new file mode 100644 --- /dev/null +++ b/src/parser/calendar/ics/__tests__/data/simple.ics @@ -0,0 +1,10 @@ +BEGIN:VCALENDAR +VERSION:2.0 +CALSCALE:GREGORIAN +BEGIN:VEVENT +CREATED:20160908T172702Z +UID:1CEB57AF-0C9C-402D-B3BD-D75BD4843F68 +DTEND;TZID=America/Los_Angeles:20160915T100000 +SUMMARY:Example Event +END:VEVENT +END:VCALENDAR