diff --git a/src/__phutil_library_map__.php b/src/__phutil_library_map__.php --- a/src/__phutil_library_map__.php +++ b/src/__phutil_library_map__.php @@ -203,6 +203,7 @@ 'PhutilCowsayTestCase' => 'utils/__tests__/PhutilCowsayTestCase.php', 'PhutilCsprintfTestCase' => 'xsprintf/__tests__/PhutilCsprintfTestCase.php', 'PhutilCzechLocale' => 'internationalization/locales/PhutilCzechLocale.php', + 'PhutilDOMNode' => 'parser/html/PhutilDOMNode.php', 'PhutilDaemon' => 'daemon/PhutilDaemon.php', 'PhutilDaemonHandle' => 'daemon/PhutilDaemonHandle.php', 'PhutilDaemonOverseer' => 'daemon/PhutilDaemonOverseer.php', @@ -260,6 +261,8 @@ 'PhutilGitURI' => 'parser/PhutilGitURI.php', 'PhutilGitURITestCase' => 'parser/__tests__/PhutilGitURITestCase.php', 'PhutilGoogleAuthAdapter' => 'auth/PhutilGoogleAuthAdapter.php', + 'PhutilHTMLParser' => 'parser/html/PhutilHTMLParser.php', + 'PhutilHTMLParserTestCase' => 'parser/html/__tests__/PhutilHTMLParserTestCase.php', 'PhutilHTTPEngineExtension' => 'future/http/PhutilHTTPEngineExtension.php', 'PhutilHTTPResponse' => 'parser/http/PhutilHTTPResponse.php', 'PhutilHTTPResponseParser' => 'parser/http/PhutilHTTPResponseParser.php', @@ -859,6 +862,7 @@ 'PhutilCowsayTestCase' => 'PhutilTestCase', 'PhutilCsprintfTestCase' => 'PhutilTestCase', 'PhutilCzechLocale' => 'PhutilLocale', + 'PhutilDOMNode' => 'Phobject', 'PhutilDaemon' => 'Phobject', 'PhutilDaemonHandle' => 'Phobject', 'PhutilDaemonOverseer' => 'Phobject', @@ -916,6 +920,8 @@ 'PhutilGitURI' => 'Phobject', 'PhutilGitURITestCase' => 'PhutilTestCase', 'PhutilGoogleAuthAdapter' => 'PhutilOAuthAuthAdapter', + 'PhutilHTMLParser' => 'Phobject', + 'PhutilHTMLParserTestCase' => 'PhutilTestCase', 'PhutilHTTPEngineExtension' => 'Phobject', 'PhutilHTTPResponse' => 'Phobject', 'PhutilHTTPResponseParser' => 'Phobject', diff --git a/src/markup/engine/remarkup/blockrule/PhutilRemarkupTableBlockRule.php b/src/markup/engine/remarkup/blockrule/PhutilRemarkupTableBlockRule.php --- a/src/markup/engine/remarkup/blockrule/PhutilRemarkupTableBlockRule.php +++ b/src/markup/engine/remarkup/blockrule/PhutilRemarkupTableBlockRule.php @@ -11,7 +11,7 @@ while (isset($lines[$cursor])) { $num_lines++; - if (preg_match('@$@i', $lines[$cursor])) { + if (preg_match('@\s*$@i', $lines[$cursor])) { break; } $cursor++; @@ -22,86 +22,117 @@ } public function markupText($text, $children) { - $matches = array(); + $root = id(new PhutilHTMLParser()) + ->parseDocument($text); - if (!preg_match('@^\s*(.*)
$@si', $text, $matches)) { - return $this->fail( - $text, - pht('Bad table (expected %s)', '...
')); - } + $nodes = $root->selectChildrenWithTags(array('table')); - $body = $matches[1]; + $out = array(); + $seen_table = false; + foreach ($nodes as $node) { + if ($node->isContentNode()) { + $content = $node->getContent(); - $row_fragment = '(?:\s*(.*)\s*)'; - $cell_fragment = '(?:\s*<(td|th)>(.*)\s*)'; + if (!strlen(trim($content))) { + // Ignore whitespace. + continue; + } - // Test that the body contains only valid rows. - if (!preg_match('@^'.$row_fragment.'+$@Usi', $body)) { - return $this->fail( - $body, - pht('Bad table syntax (expected rows %s)', '...')); - } + // If we find other content, fail the rule. This can happen if the + // input is two consecutive table tags on one line with some text + // in between them, which we currently forbid. + return $text; + } else { + // If we have multiple table tags, just return the raw text. + if ($seen_table) { + return $text; + } + $seen_table = true; - // Capture the rows. - $row_regex = '@'.$row_fragment.'@Usi'; - if (!preg_match_all($row_regex, $body, $matches, PREG_SET_ORDER)) { - throw new Exception( - pht('Bug in Remarkup tables, parsing fails for input: %s', $text)); + $out[] = $this->newTable($node); + } } - $out_rows = array(); + return phutil_implode_html('', $out); + } - $rows = $matches; - foreach ($rows as $row) { - $content = $row[1]; + private function newTable(PhutilDOMNode $table) { + $nodes = $table->selectChildrenWithTags( + array( + 'colgroup', + 'tr', + )); - // Test that the row contains only valid cells. - if (!preg_match('@^'.$cell_fragment.'+$@Usi', $content)) { - return $this->fail( - $content, - pht('Bad table syntax (expected cells %s)', '...')); - } + $colgroup = null; + $rows = array(); + + foreach ($nodes as $node) { + if ($node->isContentNode()) { + $content = $node->getContent(); + + // If this is whitespace, ignore it. + if (!strlen(trim($content))) { + continue; + } - // Capture the cells. - $cell_regex = '@'.$cell_fragment.'@Usi'; - if (!preg_match_all($cell_regex, $content, $matches, PREG_SET_ORDER)) { - throw new Exception( - pht('Bug in Remarkup tables, parsing fails for input: %s', $text)); + // If we have nonempty content between the rows, this isn't a valid + // table. We can't really do anything reasonable with this, so just + // fail out and render the raw text. + return $table->newRawString(); } - $out_cells = array(); - foreach ($matches as $cell) { - $cell_type = $cell[1]; - $cell_content = $cell[2]; + if ($node->getTagName() === 'colgroup') { + // This table has multiple "" tags. Just bail out. + if ($colgroup !== null) { + return $table->newRawString(); + } - $out_cells[] = array( - 'type' => $cell_type, - 'content' => $this->applyRules($cell_content), - ); + // This table has a "" after a "". We could parse + // this, but just reject it out of an abundance of caution. + if ($rows) { + return $table->newRawString(); + } + + $colgroup = $node; + continue; } - $out_rows[] = array( - 'type' => 'tr', - 'content' => $out_cells, - ); + $rows[] = $node; } - return $this->renderRemarkupTable($out_rows); - } + $row_specs = array(); - private function fail($near, $message) { - $message = sprintf( - '%s near: %s', - $message, - id(new PhutilUTF8StringTruncator()) - ->setMaximumGlyphs(32000) - ->truncateString($near)); + foreach ($rows as $row) { + $cells = $row->selectChildrenWithTags(array('td', 'th')); - if ($this->getEngine()->isTextMode()) { - return '('.$message.')'; + $cell_specs = array(); + foreach ($cells as $cell) { + if ($cell->isContentNode()) { + $content = $node->getContent(); + + if (!strlen(trim($content))) { + continue; + } + + return $table->newRawString(); + } + + $content = $cell->getRawContentString(); + $content = $this->applyRules($content); + + $cell_specs[] = array( + 'type' => $cell->getTagName(), + 'content' => $content, + ); + } + + $row_specs[] = array( + 'type' => 'tr', + 'content' => $cell_specs, + ); } - return hsprintf('
%s
', $message); + return $this->renderRemarkupTable($row_specs); } } diff --git a/src/parser/html/PhutilDOMNode.php b/src/parser/html/PhutilDOMNode.php new file mode 100644 --- /dev/null +++ b/src/parser/html/PhutilDOMNode.php @@ -0,0 +1,193 @@ +content = $content; + return $this; + } + + public function getContent() { + return $this->content; + } + + public function isContentNode() { + return ($this->content !== null); + } + + public function setTagName($tag_name) { + $this->tagName = $tag_name; + return $this; + } + + public function getTagName() { + return $this->tagName; + } + + public function appendChild(PhutilDOMNode $node) { + $node->parentNode = $this; + $this->children[] = $node; + return $this; + } + + public function getChildren() { + return $this->children; + } + + public function getParentNode() { + return $this->parentNode; + } + + public function setAttributes(array $attributes) { + $this->attributes = $attributes; + return $this; + } + + public function getAttributes() { + return $this->attributes; + } + + public function setRawString($raw_string) { + $this->rawString = $raw_string; + return $this; + } + + public function getRawString() { + return $this->rawString; + } + + public function toDictionary() { + if ($this->isContentNode()) { + return array( + 'content' => $this->content, + ); + } else { + $children = array(); + + foreach ($this->getChildren() as $child) { + $children[] = $child->toDictionary(); + } + + return array( + 'tag' => $this->getTagName(), + 'attributes' => $this->getAttributes(), + 'children' => $children, + ); + } + } + + /** + * Get a list of the children of a given DOM node, treating unexpected + * tags as if they were raw content. + */ + public function selectChildrenWithTags(array $tag_list) { + $tag_map = array_fuse($tag_list); + + $nodes = array(); + foreach ($this->getChildren() as $child) { + // If this is already a content node, just keep it as-is. + if ($child->isContentNode()) { + $nodes[] = $child; + continue; + } + + $tag_name = $child->getTagName(); + + // If this is a tag that we're allowing, keep it as-is. + if (isset($tag_map[$tag_name])) { + $nodes[] = $child; + continue; + } + + // Otherwise, this is some other tag. Convert it into a content + // node. + + $raw_content = $child->getRawString(); + + $nodes[] = id(new self()) + ->setContent($raw_content) + ->setRawContent($raw_content); + } + + return $this->mergeContentNodes($nodes); + } + + public function getRawContentString() { + $content_node = $this->selectChildrenWithTags(array()); + + if (!$content_node) { + return ''; + } + + return head($content_node)->getRawString(); + } + + public function mergeContent() { + $this->children = $this->mergeContentNodes($this->children); + + foreach ($this->getChildren() as $child) { + $child->parentNode = $this; + $child->mergeContent(); + } + + return $this; + } + + /** + * Given a list of nodes, combine sequences of multiple adjacent content + * nodes into single nodes. + */ + private function mergeContentNodes(array $nodes) { + $list = array(); + $content_block = array(); + foreach ($this->getChildren() as $child) { + if ($child->isContentNode()) { + $content_block[] = $child; + continue; + } + + $list[] = $content_block; + $content_block = array(); + + $list[] = $child; + } + + $list[] = $content_block; + + $results = array(); + foreach ($list as $item) { + if (!is_array($item)) { + $results[] = $item; + continue; + } + + if (!$item) { + continue; + } + + $parts = array(); + foreach ($item as $content_node) { + $parts[] = $content_node->getRawString(); + } + $parts = implode('', $parts); + + if (!strlen($parts)) { + continue; + } + + $results[] = id(new self()) + ->setContent($parts) + ->setRawString($parts); + } + + return $results; + } + +} diff --git a/src/parser/html/PhutilHTMLParser.php b/src/parser/html/PhutilHTMLParser.php new file mode 100644 --- /dev/null +++ b/src/parser/html/PhutilHTMLParser.php @@ -0,0 +1,426 @@ +"). Non-tag + // content is anything else. + + $segment_pos = 0; + $segments = array(); + $in_tag = false; + + for ($ii = 0; $ii < strlen($corpus); $ii++) { + $c = $corpus[$ii]; + + if ($in_tag && ($c === '>')) { + if ($segment_pos !== null) { + $segments[] = array( + 'tag' => $in_tag, + 'pos' => $segment_pos, + 'end' => $ii + 1, + ); + } + + $segment_pos = $ii + 1; + $in_tag = false; + continue; + } + + if (!$in_tag && ($c === '<')) { + $segments[] = array( + 'tag' => $in_tag, + 'pos' => $segment_pos, + 'end' => $ii, + ); + + $segment_pos = $ii; + $in_tag = true; + continue; + } + } + + // Add whatever content was left at the end of the string. If we were in + // a tag but did not find a closing ">", we treat this as normal content. + $segments[] = array( + 'tag' => false, + 'pos' => $segment_pos, + 'end' => $ii, + ); + + // Slice the marked segments out of the raw corpus so we get a list of + // "tag" strings and a list of "non-tag" strings. + + $parts = array(); + $corpus_length = strlen($corpus); + foreach ($segments as $segment) { + $tag = $segment['tag']; + $pos = $segment['pos']; + $len = $segment['end'] - $pos; + + // If this is a tag, we'll drop the "<" at the beginning and the ">" + // at the end here. + if ($tag) { + $slice_pos = $pos + 1; + $slice_len = $len - 2; + } else { + $slice_pos = $pos; + $slice_len = $len; + } + + if (($slice_pos < $corpus_length) && ($slice_len > 0)) { + $content = substr($corpus, $slice_pos, $slice_len); + } else { + $content = ''; + } + + $parts[] = array( + 'tag' => $tag, + 'pos' => $pos, + 'len' => $len, + 'content' => $content, + ); + } + + $root = new PhutilDOMNode(); + $this->setCursor($root); + + foreach ($parts as $part) { + $tag = $this->newTagDOMNode($part); + + if ($tag !== null) { + continue; + } + + $content = $part['content']; + + // If this part is a tag, restore the angle brackets. + if ($part['tag']) { + $content = '<'.$content.'>'; + } + + $node = id(new PhutilDOMNode()) + ->setContent($content) + ->setRawString($content); + + $this->getCursor()->appendChild($node); + } + + $root->mergeContent(); + + return $root; + } + + private function newTagDOMNode(array $part) { + if (!$part['tag']) { + return null; + } + + $raw_content = $part['content']; + $content = $raw_content; + + $content = trim($content); + $content_len = strlen($content); + + // If the tag content begins with "/", like "", strip the slash + // off and mark this as a closing tag. + $is_close = false; + if ($content_len > 0 && $content[0] === '/') { + $is_close = true; + $content = substr($content, 1); + $content = trim($content); + $content_len = strlen($content); + } + + // If the tag content ends with "/", like "", strip the slash off + // and mark this as self-closing. + $self_close = false; + if ($content_len > 0 && $content[$content_len - 1] === '/') { + $self_close = true; + $content = substr($content, 0, $content_len - 1); + $content = trim($content); + $content_len = strlen($content); + } + + // If this tag is both a closing tag and a self-closing tag, it is + // not formatted correctly. Treat it as content. + if ($self_close && $is_close) { + return null; + } + + // Now, split the rest of the tag into the tag name and tag attributes. + $pieces = preg_split('/\s+/', $content, 2); + $tag_name = $pieces[0]; + + if (count($pieces) > 1) { + $attributes = $pieces[1]; + } else { + $attributes = ''; + } + + // If there's no tag name, this tag is not valid. Treat it as content. + if (!strlen($tag_name)) { + return null; + } + + // If this is a closing tag with attributes, it's not valid. Treat it + // as content. + if ($is_close && strlen($attributes)) { + return null; + } + + $tag_name = phutil_utf8_strtolower($tag_name); + + // If we find a valid closing tag, try to find a matching tag on the stack. + // If we find a matching tag, close it. + // If we do not find a matching tag, treat the closing tag as content. + if ($is_close) { + $cursor = $this->getCursor(); + + while ($cursor) { + if ($cursor->getTagName() === $tag_name) { + $parent = $cursor->getParentNode(); + $this->setCursor($parent); + return true; + } + $cursor = $cursor->getParentNode(); + } + + return null; + } + + if (strlen($attributes)) { + $attribute_map = $this->parseAttributes($attributes); + // If the attributes can't be parsed, treat the tag as content. + if ($attribute_map === null) { + return null; + } + } else { + $attribute_map = array(); + } + + $node = id(new PhutilDOMNode()) + ->setTagName($tag_name) + ->setAttributes($attribute_map) + ->setRawString('<'.$raw_content.'>'); + + $cursor = $this->getCursor(); + $cursor->appendChild($node); + + if (!$self_close) { + $this->setCursor($node); + } + + return $node; + } + + private function setCursor(PhutilDOMNode $cursor) { + $this->cursor = $cursor; + return $this; + } + + private function getCursor() { + return $this->cursor; + } + + private function parseAttributes($attributes) { + $state = 'key'; + + $whitespace = array( + ' ' => true, + "\n" => true, + "\t" => true, + "\r" => true, + ); + + $map = array(); + $len = strlen($attributes); + $key_pos = null; + for ($ii = 0; $ii < $len; $ii++) { + $c = $attributes[$ii]; + $is_space = isset($whitespace[$c]); + + switch ($state) { + case 'key': + // We're looking for the start of an attribute name. + + // Skip over any whitespace. + if ($is_space) { + break; + } + + // If we see "". + if (isset($map[$name_value])) { + return null; + } + } + + // If we find an "=", that's the end of the name. Next, we're going + // to parse a value. + if ($c === '=') { + $state = 'value'; + break; + } + + // If we find whitespace, that's the end of the name. We're going + // to look for an "=". + if ($is_space) { + $state = 'equals'; + break; + } + + break; + case 'equals': + // We've parsed the name of an attribute and are looking for an + // "=" character. + + // Skip over any whitespace. + if ($is_space) { + break; + } + + // This is the "=" we're looking for, so we're good to go. + if ($c === '=') { + $state = 'value'; + break; + } + + // If this is anything else, this is an attribute name with no + // value. Treat it as "true" and move on. This corresponds to an + // input like "". + $map[$name_value] = true; + $name_pos = $ii; + $state = 'name'; + break; + case 'value': + // We've parsed an "=" and are looking for the start of a value. + + // Skip over any whitespace. + if ($is_space) { + break; + } + + // Don't accept "parseDocument($input); + + // We're just testing the child list of the root node since this + // reduces the amount of boilerplate in the test cases. + $list = array(); + foreach ($document->getChildren() as $child) { + $list[] = $child->toDictionary(); + } + + $this->assertEqual( + $expect, + $list, + pht('DOM tree for "%s".', $test)); + } + } + +} diff --git a/src/parser/html/__tests__/data/attributes-basic.txt b/src/parser/html/__tests__/data/attributes-basic.txt new file mode 100644 --- /dev/null +++ b/src/parser/html/__tests__/data/attributes-basic.txt @@ -0,0 +1,13 @@ + +~~~~~~~~~~ +[ + { + "tag": "a", + "attributes": { + "b": "1", + "c": true, + "d": "e" + }, + "children": [] + } +] diff --git a/src/parser/html/__tests__/data/content-angle.txt b/src/parser/html/__tests__/data/content-angle.txt new file mode 100644 --- /dev/null +++ b/src/parser/html/__tests__/data/content-angle.txt @@ -0,0 +1,7 @@ +o< quack +~~~~~~~~~~ +[ + { + "content": "o< quack" + } +] diff --git a/src/parser/html/__tests__/data/content-simple.txt b/src/parser/html/__tests__/data/content-simple.txt new file mode 100644 --- /dev/null +++ b/src/parser/html/__tests__/data/content-simple.txt @@ -0,0 +1,7 @@ +quack +~~~~~~~~~~ +[ + { + "content": "quack" + } +] diff --git a/src/parser/html/__tests__/data/tag-mismatch.txt b/src/parser/html/__tests__/data/tag-mismatch.txt new file mode 100644 --- /dev/null +++ b/src/parser/html/__tests__/data/tag-mismatch.txt @@ -0,0 +1,21 @@ + +~~~~~~~~~~ +[ + { + "tag": "a", + "attributes": {}, + "children": [ + { + "tag": "b", + "attributes": {}, + "children": [ + { + "tag": "c", + "attributes": {}, + "children": [] + } + ] + } + ] + } +] diff --git a/src/parser/html/__tests__/data/tag-simple.txt b/src/parser/html/__tests__/data/tag-simple.txt new file mode 100644 --- /dev/null +++ b/src/parser/html/__tests__/data/tag-simple.txt @@ -0,0 +1,135 @@ + + +< a/> + + +< a /> +< a / > + + +< a>< /a> +< a >< /a > + + +< a>< / a> +< a >< / a > +~~~~~~~~~~ +[ + { + "tag": "a", + "attributes": {}, + "children": [] + }, + { + "content": "\n" + }, + { + "tag": "a", + "attributes": {}, + "children": [] + }, + { + "content": "\n" + }, + { + "tag": "a", + "attributes": {}, + "children": [] + }, + { + "content": "\n" + }, + { + "tag": "a", + "attributes": {}, + "children": [] + }, + { + "content": "\n" + }, + { + "tag": "a", + "attributes": {}, + "children": [] + }, + { + "content": "\n" + }, + { + "tag": "a", + "attributes": {}, + "children": [] + }, + { + "content": "\n" + }, + { + "tag": "a", + "attributes": {}, + "children": [] + }, + { + "content": "\n" + }, + { + "tag": "a", + "attributes": {}, + "children": [] + }, + { + "content": "\n" + }, + { + "tag": "a", + "attributes": {}, + "children": [] + }, + { + "content": "\n" + }, + { + "tag": "a", + "attributes": {}, + "children": [] + }, + { + "content": "\n" + }, + { + "tag": "a", + "attributes": {}, + "children": [] + }, + { + "content": "\n" + }, + { + "tag": "a", + "attributes": {}, + "children": [] + }, + { + "content": "\n" + }, + { + "tag": "a", + "attributes": {}, + "children": [] + }, + { + "content": "\n" + }, + { + "tag": "a", + "attributes": {}, + "children": [] + }, + { + "content": "\n" + }, + { + "tag": "a", + "attributes": {}, + "children": [] + } +] diff --git a/src/parser/html/__tests__/data/tag-table.txt b/src/parser/html/__tests__/data/tag-table.txt new file mode 100644 --- /dev/null +++ b/src/parser/html/__tests__/data/tag-table.txt @@ -0,0 +1,39 @@ +
ab
+~~~~~~~~~~ +[ + { + "tag": "table", + "attributes": {}, + "children": [ + { + "tag": "tr", + "attributes": {}, + "children": [ + { + "tag": "td", + "attributes": {}, + "children": [ + { + "content": "a" + } + ] + }, + { + "tag": "td", + "attributes": {}, + "children": [ + { + "content": "b" + } + ] + }, + { + "tag": "td", + "attributes": {}, + "children": [] + } + ] + } + ] + } +]