Page MenuHomePhabricator

D20568.diff
No OneTemporary

D20568.diff

diff --git a/src/__phutil_library_map__.php b/src/__phutil_library_map__.php
--- a/src/__phutil_library_map__.php
+++ b/src/__phutil_library_map__.php
@@ -203,6 +203,7 @@
'PhutilCowsayTestCase' => 'utils/__tests__/PhutilCowsayTestCase.php',
'PhutilCsprintfTestCase' => 'xsprintf/__tests__/PhutilCsprintfTestCase.php',
'PhutilCzechLocale' => 'internationalization/locales/PhutilCzechLocale.php',
+ 'PhutilDOMNode' => 'parser/html/PhutilDOMNode.php',
'PhutilDaemon' => 'daemon/PhutilDaemon.php',
'PhutilDaemonHandle' => 'daemon/PhutilDaemonHandle.php',
'PhutilDaemonOverseer' => 'daemon/PhutilDaemonOverseer.php',
@@ -260,6 +261,8 @@
'PhutilGitURI' => 'parser/PhutilGitURI.php',
'PhutilGitURITestCase' => 'parser/__tests__/PhutilGitURITestCase.php',
'PhutilGoogleAuthAdapter' => 'auth/PhutilGoogleAuthAdapter.php',
+ 'PhutilHTMLParser' => 'parser/html/PhutilHTMLParser.php',
+ 'PhutilHTMLParserTestCase' => 'parser/html/__tests__/PhutilHTMLParserTestCase.php',
'PhutilHTTPEngineExtension' => 'future/http/PhutilHTTPEngineExtension.php',
'PhutilHTTPResponse' => 'parser/http/PhutilHTTPResponse.php',
'PhutilHTTPResponseParser' => 'parser/http/PhutilHTTPResponseParser.php',
@@ -859,6 +862,7 @@
'PhutilCowsayTestCase' => 'PhutilTestCase',
'PhutilCsprintfTestCase' => 'PhutilTestCase',
'PhutilCzechLocale' => 'PhutilLocale',
+ 'PhutilDOMNode' => 'Phobject',
'PhutilDaemon' => 'Phobject',
'PhutilDaemonHandle' => 'Phobject',
'PhutilDaemonOverseer' => 'Phobject',
@@ -916,6 +920,8 @@
'PhutilGitURI' => 'Phobject',
'PhutilGitURITestCase' => 'PhutilTestCase',
'PhutilGoogleAuthAdapter' => 'PhutilOAuthAuthAdapter',
+ 'PhutilHTMLParser' => 'Phobject',
+ 'PhutilHTMLParserTestCase' => 'PhutilTestCase',
'PhutilHTTPEngineExtension' => 'Phobject',
'PhutilHTTPResponse' => 'Phobject',
'PhutilHTTPResponseParser' => 'Phobject',
diff --git a/src/markup/engine/remarkup/blockrule/PhutilRemarkupTableBlockRule.php b/src/markup/engine/remarkup/blockrule/PhutilRemarkupTableBlockRule.php
--- a/src/markup/engine/remarkup/blockrule/PhutilRemarkupTableBlockRule.php
+++ b/src/markup/engine/remarkup/blockrule/PhutilRemarkupTableBlockRule.php
@@ -11,7 +11,7 @@
while (isset($lines[$cursor])) {
$num_lines++;
- if (preg_match('@</table>$@i', $lines[$cursor])) {
+ if (preg_match('@</table>\s*$@i', $lines[$cursor])) {
break;
}
$cursor++;
@@ -22,86 +22,117 @@
}
public function markupText($text, $children) {
- $matches = array();
+ $root = id(new PhutilHTMLParser())
+ ->parseDocument($text);
- if (!preg_match('@^\s*<table>(.*)</table>$@si', $text, $matches)) {
- return $this->fail(
- $text,
- pht('Bad table (expected %s)', '<table>...</table>'));
- }
+ $nodes = $root->selectChildrenWithTags(array('table'));
- $body = $matches[1];
+ $out = array();
+ $seen_table = false;
+ foreach ($nodes as $node) {
+ if ($node->isContentNode()) {
+ $content = $node->getContent();
- $row_fragment = '(?:\s*<tr>(.*)</tr>\s*)';
- $cell_fragment = '(?:\s*<(td|th)>(.*)</(?:td|th)>\s*)';
+ if (!strlen(trim($content))) {
+ // Ignore whitespace.
+ continue;
+ }
- // Test that the body contains only valid rows.
- if (!preg_match('@^'.$row_fragment.'+$@Usi', $body)) {
- return $this->fail(
- $body,
- pht('Bad table syntax (expected rows %s)', '<tr>...</tr>'));
- }
+ // If we find other content, fail the rule. This can happen if the
+ // input is two consecutive table tags on one line with some text
+ // in between them, which we currently forbid.
+ return $text;
+ } else {
+ // If we have multiple table tags, just return the raw text.
+ if ($seen_table) {
+ return $text;
+ }
+ $seen_table = true;
- // Capture the rows.
- $row_regex = '@'.$row_fragment.'@Usi';
- if (!preg_match_all($row_regex, $body, $matches, PREG_SET_ORDER)) {
- throw new Exception(
- pht('Bug in Remarkup tables, parsing fails for input: %s', $text));
+ $out[] = $this->newTable($node);
+ }
}
- $out_rows = array();
+ return phutil_implode_html('', $out);
+ }
- $rows = $matches;
- foreach ($rows as $row) {
- $content = $row[1];
+ private function newTable(PhutilDOMNode $table) {
+ $nodes = $table->selectChildrenWithTags(
+ array(
+ 'colgroup',
+ 'tr',
+ ));
- // Test that the row contains only valid cells.
- if (!preg_match('@^'.$cell_fragment.'+$@Usi', $content)) {
- return $this->fail(
- $content,
- pht('Bad table syntax (expected cells %s)', '<td>...</td>'));
- }
+ $colgroup = null;
+ $rows = array();
+
+ foreach ($nodes as $node) {
+ if ($node->isContentNode()) {
+ $content = $node->getContent();
+
+ // If this is whitespace, ignore it.
+ if (!strlen(trim($content))) {
+ continue;
+ }
- // Capture the cells.
- $cell_regex = '@'.$cell_fragment.'@Usi';
- if (!preg_match_all($cell_regex, $content, $matches, PREG_SET_ORDER)) {
- throw new Exception(
- pht('Bug in Remarkup tables, parsing fails for input: %s', $text));
+ // If we have nonempty content between the rows, this isn't a valid
+ // table. We can't really do anything reasonable with this, so just
+ // fail out and render the raw text.
+ return $table->newRawString();
}
- $out_cells = array();
- foreach ($matches as $cell) {
- $cell_type = $cell[1];
- $cell_content = $cell[2];
+ if ($node->getTagName() === 'colgroup') {
+ // This table has multiple "<colgroup />" tags. Just bail out.
+ if ($colgroup !== null) {
+ return $table->newRawString();
+ }
- $out_cells[] = array(
- 'type' => $cell_type,
- 'content' => $this->applyRules($cell_content),
- );
+ // This table has a "<colgroup />" after a "<tr />". We could parse
+ // this, but just reject it out of an abundance of caution.
+ if ($rows) {
+ return $table->newRawString();
+ }
+
+ $colgroup = $node;
+ continue;
}
- $out_rows[] = array(
- 'type' => 'tr',
- 'content' => $out_cells,
- );
+ $rows[] = $node;
}
- return $this->renderRemarkupTable($out_rows);
- }
+ $row_specs = array();
- private function fail($near, $message) {
- $message = sprintf(
- '%s near: %s',
- $message,
- id(new PhutilUTF8StringTruncator())
- ->setMaximumGlyphs(32000)
- ->truncateString($near));
+ foreach ($rows as $row) {
+ $cells = $row->selectChildrenWithTags(array('td', 'th'));
- if ($this->getEngine()->isTextMode()) {
- return '('.$message.')';
+ $cell_specs = array();
+ foreach ($cells as $cell) {
+ if ($cell->isContentNode()) {
+ $content = $cell->getContent();
+ // Whitespace between cells is fine; other text makes the row invalid.
+ if (!strlen(trim($content))) {
+ continue;
+ }
+
+ return $table->newRawString();
+ }
+
+ $content = $cell->getRawContentString();
+ $content = $this->applyRules($content);
+
+ $cell_specs[] = array(
+ 'type' => $cell->getTagName(),
+ 'content' => $content,
+ );
+ }
+
+ $row_specs[] = array(
+ 'type' => 'tr',
+ 'content' => $cell_specs,
+ );
}
- return hsprintf('<div style="color: red;">%s</div>', $message);
+ return $this->renderRemarkupTable($row_specs);
}
}
diff --git a/src/parser/html/PhutilDOMNode.php b/src/parser/html/PhutilDOMNode.php
new file mode 100644
--- /dev/null
+++ b/src/parser/html/PhutilDOMNode.php
@@ -0,0 +1,313 @@
+<?php
+
+/**
+ * A node in the simple DOM tree produced by PhutilHTMLParser.
+ *
+ * A node is a "content node" when it has non-null content; otherwise it is
+ * treated as a tag node with a tag name, attributes, and children. The parser
+ * also uses one bare node (no content, no tag name) as the document root.
+ */
+final class PhutilDOMNode extends Phobject {
+
+  private $content;
+  private $tagName;
+  private $children = array();
+  private $attributes = array();
+  private $parentNode;
+  private $rawString;
+
+  /**
+   * Set the text content; any non-null value makes this a content node.
+   *
+   * @param string|null $content Text content.
+   * @return $this
+   */
+  public function setContent($content) {
+    $this->content = $content;
+    return $this;
+  }
+
+  /**
+   * @return string|null Text content, or null if none has been set.
+   */
+  public function getContent() {
+    return $this->content;
+  }
+
+  /**
+   * @return bool True if this node has non-null text content.
+   */
+  public function isContentNode() {
+    return ($this->content !== null);
+  }
+
+  /**
+   * @param string $tag_name Tag name for this node.
+   * @return $this
+   */
+  public function setTagName($tag_name) {
+    $this->tagName = $tag_name;
+    return $this;
+  }
+
+  /**
+   * @return string|null Tag name, or null if none has been set.
+   */
+  public function getTagName() {
+    return $this->tagName;
+  }
+
+  /**
+   * Append a child node and record this node as its parent.
+   *
+   * @param PhutilDOMNode $node Child to append.
+   * @return $this
+   */
+  public function appendChild(PhutilDOMNode $node) {
+    $node->parentNode = $this;
+    $this->children[] = $node;
+    return $this;
+  }
+
+  /**
+   * @return list<PhutilDOMNode> Child nodes.
+   */
+  public function getChildren() {
+    return $this->children;
+  }
+
+  /**
+   * @return PhutilDOMNode|null Parent node, or null if this node has none.
+   */
+  public function getParentNode() {
+    return $this->parentNode;
+  }
+
+  /**
+   * @param array $attributes Map of attribute names to values.
+   * @return $this
+   */
+  public function setAttributes(array $attributes) {
+    $this->attributes = $attributes;
+    return $this;
+  }
+
+  /**
+   * @return array Map of attribute names to values.
+   */
+  public function getAttributes() {
+    return $this->attributes;
+  }
+
+  /**
+   * Set the source text for this node alone (not for its children).
+   *
+   * @param string $raw_string Source text.
+   * @return $this
+   */
+  public function setRawString($raw_string) {
+    $this->rawString = $raw_string;
+    return $this;
+  }
+
+  /**
+   * @return string|null Source text for this node alone, if one was set.
+   */
+  public function getRawString() {
+    return $this->rawString;
+  }
+
+  /**
+   * Rebuild source text for this node together with its whole subtree.
+   *
+   * Content nodes yield their raw string. Other nodes yield their own raw
+   * string, then the rebuilt text of every child, then a closing tag. The
+   * text of the original closing tag is not stored on the node, so it is
+   * synthesized as "</name>" from the tag name. No closing tag is added for
+   * nodes without a tag name or for self-closing tags such as "<td />".
+   *
+   * @return string Reconstructed source text.
+   */
+  public function newRawString() {
+    $head = (string)$this->getRawString();
+
+    if ($this->isContentNode()) {
+      return $head;
+    }
+
+    $parts = array($head);
+    foreach ($this->getChildren() as $child) {
+      $parts[] = $child->newRawString();
+    }
+
+    $tag_name = $this->getTagName();
+    if ($tag_name !== null) {
+      // Drop the trailing ">" and any whitespace before it, so that both
+      // "<td/>" and "<td / >" are recognized as self-closing.
+      $open = rtrim((string)substr($head, 0, -1));
+      if (substr($open, -1) !== '/') {
+        $parts[] = '</'.$tag_name.'>';
+      }
+    }
+
+    return implode('', $parts);
+  }
+
+  /**
+   * Serialize this subtree as nested arrays.
+   *
+   * @return array For content nodes, a map with a "content" key. Otherwise,
+   *   a map with "tag", "attributes" and "children" keys.
+   */
+  public function toDictionary() {
+    if ($this->isContentNode()) {
+      return array(
+        'content' => $this->content,
+      );
+    } else {
+      $children = array();
+
+      foreach ($this->getChildren() as $child) {
+        $children[] = $child->toDictionary();
+      }
+
+      return array(
+        'tag' => $this->getTagName(),
+        'attributes' => $this->getAttributes(),
+        'children' => $children,
+      );
+    }
+  }
+
+  /**
+   * Get a list of the children of a given DOM node, treating unexpected
+   * tags as if they were raw content.
+   *
+   * A child whose tag is not listed is replaced by a content node holding
+   * the rebuilt source text of that child's whole subtree. Adjacent content
+   * nodes in the result are merged.
+   *
+   * @param list<string> $tag_list Tag names to keep as tag nodes.
+   * @return list<PhutilDOMNode> Kept tag nodes and merged content nodes.
+   */
+  public function selectChildrenWithTags(array $tag_list) {
+    $tag_map = array_fuse($tag_list);
+
+    $nodes = array();
+    foreach ($this->getChildren() as $child) {
+      // If this is already a content node, just keep it as-is.
+      if ($child->isContentNode()) {
+        $nodes[] = $child;
+        continue;
+      }
+
+      $tag_name = $child->getTagName();
+
+      // If this is a tag that we're allowing, keep it as-is.
+      if (isset($tag_map[$tag_name])) {
+        $nodes[] = $child;
+        continue;
+      }
+
+      // Otherwise, this is some other tag. Convert it, along with
+      // everything nested inside it, into a content node.
+      $raw_content = $child->newRawString();
+
+      $nodes[] = id(new self())
+        ->setContent($raw_content)
+        ->setRawString($raw_content);
+    }
+
+    return $this->mergeContentNodes($nodes);
+  }
+
+  /**
+   * Get the source text of everything inside this node as one string.
+   *
+   * @return string Inner source text, or an empty string if there is none.
+   */
+  public function getRawContentString() {
+    // Allowing no tags turns every child into content, and merging then
+    // leaves at most one content node.
+    $content_node = $this->selectChildrenWithTags(array());
+
+    if (!$content_node) {
+      return '';
+    }
+
+    return head($content_node)->getRawString();
+  }
+
+  /**
+   * Merge adjacent content nodes throughout this subtree, in place.
+   *
+   * @return $this
+   */
+  public function mergeContent() {
+    $this->children = $this->mergeContentNodes($this->children);
+
+    foreach ($this->getChildren() as $child) {
+      // Merged content nodes are new objects, so restore the parent link.
+      $child->parentNode = $this;
+      $child->mergeContent();
+    }
+
+    return $this;
+  }
+
+  /**
+   * Given a list of nodes, combine sequences of multiple adjacent content
+   * nodes into single nodes.
+   *
+   * Runs of content whose combined text is empty are dropped entirely.
+   *
+   * @param list<PhutilDOMNode> $nodes Nodes to merge.
+   * @return list<PhutilDOMNode> Merged node list.
+   */
+  private function mergeContentNodes(array $nodes) {
+    $list = array();
+    $content_block = array();
+    foreach ($nodes as $node) {
+      if ($node->isContentNode()) {
+        $content_block[] = $node;
+        continue;
+      }
+
+      $list[] = $content_block;
+      $content_block = array();
+
+      $list[] = $node;
+    }
+
+    $list[] = $content_block;
+
+    $results = array();
+    foreach ($list as $item) {
+      if (!is_array($item)) {
+        $results[] = $item;
+        continue;
+      }
+
+      if (!$item) {
+        continue;
+      }
+
+      $parts = array();
+      foreach ($item as $content_node) {
+        $parts[] = $content_node->getRawString();
+      }
+      $parts = implode('', $parts);
+
+      if (!strlen($parts)) {
+        continue;
+      }
+
+      $results[] = id(new self())
+        ->setContent($parts)
+        ->setRawString($parts);
+    }
+
+    return $results;
+  }
+
+}
diff --git a/src/parser/html/PhutilHTMLParser.php b/src/parser/html/PhutilHTMLParser.php
new file mode 100644
--- /dev/null
+++ b/src/parser/html/PhutilHTMLParser.php
@@ -0,0 +1,464 @@
+<?php
+
+/**
+ * Lenient parser which turns HTML-like text into a tree of PhutilDOMNode
+ * objects. Text that does not form a valid tag is kept as content instead of
+ * raising an error.
+ */
+final class PhutilHTMLParser extends Phobject {
+
+  private $cursor;
+
+  /**
+   * Parse a block of text into a DOM tree.
+   *
+   * @param string $corpus Text to parse.
+   * @return PhutilDOMNode Root node; the parsed nodes are its children.
+   */
+  public function parseDocument($corpus) {
+    // Divide the block into sequences of "tag" and "non-tag" content. Tag
+    // content is anything between angle brackets ("<" and ">"). Non-tag
+    // content is anything else.
+
+    $segment_pos = 0;
+    $segments = array();
+    $in_tag = false;
+    $corpus_length = strlen($corpus);
+
+    for ($ii = 0; $ii < $corpus_length; $ii++) {
+      $c = $corpus[$ii];
+
+      if ($in_tag && ($c === '>')) {
+        $segments[] = array(
+          'tag' => true,
+          'pos' => $segment_pos,
+          'end' => $ii + 1,
+        );
+
+        $segment_pos = $ii + 1;
+        $in_tag = false;
+        continue;
+      }
+
+      if (!$in_tag && ($c === '<')) {
+        $segments[] = array(
+          'tag' => false,
+          'pos' => $segment_pos,
+          'end' => $ii,
+        );
+
+        $segment_pos = $ii;
+        $in_tag = true;
+        continue;
+      }
+    }
+
+    // Add whatever content was left at the end of the string. If we were in
+    // a tag but did not find a closing ">", we treat this as normal content.
+    $segments[] = array(
+      'tag' => false,
+      'pos' => $segment_pos,
+      'end' => $corpus_length,
+    );
+
+    // Slice the marked segments out of the raw corpus so we get a list of
+    // "tag" strings and a list of "non-tag" strings.
+
+    $parts = array();
+    foreach ($segments as $segment) {
+      $tag = $segment['tag'];
+      $pos = $segment['pos'];
+      $len = $segment['end'] - $pos;
+
+      // If this is a tag, we'll drop the "<" at the beginning and the ">"
+      // at the end here.
+      if ($tag) {
+        $slice_pos = $pos + 1;
+        $slice_len = $len - 2;
+      } else {
+        $slice_pos = $pos;
+        $slice_len = $len;
+      }
+
+      if (($slice_pos < $corpus_length) && ($slice_len > 0)) {
+        $content = substr($corpus, $slice_pos, $slice_len);
+      } else {
+        $content = '';
+      }
+
+      $parts[] = array(
+        'tag' => $tag,
+        'pos' => $pos,
+        'len' => $len,
+        'content' => $content,
+      );
+    }
+
+    $root = new PhutilDOMNode();
+    $this->setCursor($root);
+
+    foreach ($parts as $part) {
+      // A non-null result means the part was consumed as a tag.
+      $tag = $this->newTagDOMNode($part);
+
+      if ($tag !== null) {
+        continue;
+      }
+
+      $content = $part['content'];
+
+      // If this part is a tag, restore the angle brackets.
+      if ($part['tag']) {
+        $content = '<'.$content.'>';
+      }
+
+      $node = id(new PhutilDOMNode())
+        ->setContent($content)
+        ->setRawString($content);
+
+      $this->getCursor()->appendChild($node);
+    }
+
+    $root->mergeContent();
+
+    return $root;
+  }
+
+  /**
+   * Try to interpret one part of the corpus as a tag and update the tree.
+   *
+   * An opening tag is appended under the cursor and, unless self-closing,
+   * becomes the new cursor. A closing tag searches from the cursor upward for
+   * a node with the same tag name and moves the cursor to that node's parent.
+   *
+   * @param array $part Part built by parseDocument().
+   * @return PhutilDOMNode|true|null New node for an opening tag, true for a
+   *   closing tag that matched, or null if the part should be treated as
+   *   plain content.
+   */
+  private function newTagDOMNode(array $part) {
+    if (!$part['tag']) {
+      return null;
+    }
+
+    $raw_content = $part['content'];
+    $content = $raw_content;
+
+    $content = trim($content);
+    $content_len = strlen($content);
+
+    // If the tag content begins with "/", like "</td>", strip the slash
+    // off and mark this as a closing tag.
+    $is_close = false;
+    if ($content_len > 0 && $content[0] === '/') {
+      $is_close = true;
+      $content = substr($content, 1);
+      $content = trim($content);
+      $content_len = strlen($content);
+    }
+
+    // If the tag content ends with "/", like "<td />", strip the slash off
+    // and mark this as self-closing.
+    $self_close = false;
+    if ($content_len > 0 && $content[$content_len - 1] === '/') {
+      $self_close = true;
+      $content = substr($content, 0, $content_len - 1);
+      $content = trim($content);
+      $content_len = strlen($content);
+    }
+
+    // If this tag is both a closing tag and a self-closing tag, it is
+    // not formatted correctly. Treat it as content.
+    if ($self_close && $is_close) {
+      return null;
+    }
+
+    // Now, split the rest of the tag into the tag name and tag attributes.
+    $pieces = preg_split('/\s+/', $content, 2);
+    $tag_name = $pieces[0];
+
+    if (count($pieces) > 1) {
+      $attributes = $pieces[1];
+    } else {
+      $attributes = '';
+    }
+
+    // If there's no tag name, this tag is not valid. Treat it as content.
+    if (!strlen($tag_name)) {
+      return null;
+    }
+
+    // If this is a closing tag with attributes, it's not valid. Treat it
+    // as content.
+    if ($is_close && strlen($attributes)) {
+      return null;
+    }
+
+    $tag_name = phutil_utf8_strtolower($tag_name);
+
+    // If we find a valid closing tag, try to find a matching tag on the stack.
+    // If we find a matching tag, close it.
+    // If we do not find a matching tag, treat the closing tag as content.
+    if ($is_close) {
+      $cursor = $this->getCursor();
+
+      while ($cursor) {
+        if ($cursor->getTagName() === $tag_name) {
+          $parent = $cursor->getParentNode();
+          $this->setCursor($parent);
+          return true;
+        }
+        $cursor = $cursor->getParentNode();
+      }
+
+      return null;
+    }
+
+    if (strlen($attributes)) {
+      $attribute_map = $this->parseAttributes($attributes);
+      // If the attributes can't be parsed, treat the tag as content.
+      if ($attribute_map === null) {
+        return null;
+      }
+    } else {
+      $attribute_map = array();
+    }
+
+    $node = id(new PhutilDOMNode())
+      ->setTagName($tag_name)
+      ->setAttributes($attribute_map)
+      ->setRawString('<'.$raw_content.'>');
+
+    $cursor = $this->getCursor();
+    $cursor->appendChild($node);
+
+    if (!$self_close) {
+      $this->setCursor($node);
+    }
+
+    return $node;
+  }
+
+  /**
+   * @param PhutilDOMNode $cursor Node which will receive new children.
+   * @return $this
+   */
+  private function setCursor(PhutilDOMNode $cursor) {
+    $this->cursor = $cursor;
+    return $this;
+  }
+
+  /**
+   * @return PhutilDOMNode Node which currently receives new children.
+   */
+  private function getCursor() {
+    return $this->cursor;
+  }
+
+  /**
+   * Parse the attribute portion of a tag into a map.
+   *
+   * Attribute names are lowercased. An attribute without a value maps to true.
+   *
+   * @param string $attributes Text which follows the tag name.
+   * @return array|null Map of attribute names to values, or null if the
+   *   attributes are malformed (for example, a repeated name).
+   */
+  private function parseAttributes($attributes) {
+    $state = 'key';
+
+    $whitespace = array(
+      ' ' => true,
+      "\n" => true,
+      "\t" => true,
+      "\r" => true,
+    );
+
+    $map = array();
+    $len = strlen($attributes);
+    for ($ii = 0; $ii < $len; $ii++) {
+      $c = $attributes[$ii];
+      $is_space = isset($whitespace[$c]);
+
+      switch ($state) {
+        case 'key':
+          // We're looking for the start of an attribute name.
+
+          // Skip over any whitespace.
+          if ($is_space) {
+            break;
+          }
+
+          // If we see "<tag =...", that isn't valid. Treat this tag as
+          // content.
+          if ($c === '=') {
+            return null;
+          }
+
+          // If we see a quotation mark with no attribute name, that isn't
+          // valid. Treat this tag as content.
+          if ($c === '"') {
+            return null;
+          }
+
+          // Any other character marks the beginning of an attribute name.
+          // Switch the parser state to "name" to parse the name.
+          $name_pos = $ii;
+          $state = 'name';
+          break;
+        case 'name':
+          // We're looking for the end of an attribute name.
+
+          // Finding a "=" or a space character ends the attribute name.
+          // Save it, then figure out what to do with the parser state.
+          if ($c === '=' || $is_space) {
+            $name_value = substr($attributes, $name_pos, $ii - $name_pos);
+            $name_value = phutil_utf8_strtolower($name_value);
+
+            // If this attribute already exists, the tag is invalid. This means
+            // the input is something like "<tag a=1 a=2>".
+            if (isset($map[$name_value])) {
+              return null;
+            }
+          }
+
+          // If we find an "=", that's the end of the name. Next, we're going
+          // to parse a value.
+          if ($c === '=') {
+            $state = 'value';
+            break;
+          }
+
+          // If we find whitespace, that's the end of the name. We're going
+          // to look for an "=".
+          if ($is_space) {
+            $state = 'equals';
+            break;
+          }
+
+          break;
+        case 'equals':
+          // We've parsed the name of an attribute and are looking for an
+          // "=" character.
+
+          // Skip over any whitespace.
+          if ($is_space) {
+            break;
+          }
+
+          // This is the "=" we're looking for, so we're good to go.
+          if ($c === '=') {
+            $state = 'value';
+            break;
+          }
+
+          // If this is anything else, this is an attribute name with no
+          // value. Treat it as "true" and move on. This corresponds to an
+          // input like "<input disabled>".
+          $map[$name_value] = true;
+          $name_pos = $ii;
+          $state = 'name';
+          break;
+        case 'value':
+          // We've parsed an "=" and are looking for the start of a value.
+
+          // Skip over any whitespace.
+          if ($is_space) {
+            break;
+          }
+
+          // Don't accept "<tag a==" to mean that key "a" has a value of
+          // "=", since this is silly. To specify a value beginning with "=",
+          // you have to quote it.
+          if ($c === '=') {
+            return null;
+          }
+
+          // Anything else is a value.
+          $value_pos = $ii;
+
+          // This is a quotation mark, so parse a quoted value.
+          if ($c === '"') {
+            $value_pos = $value_pos + 1;
+            $state = 'quoted';
+          } else {
+            $state = 'unquoted';
+          }
+          break;
+        case 'quoted':
+          // We've started parsing a quoted value, so look for the closing
+          // quote.
+
+          // We found the closing quote, so pull out the actual value.
+          if ($c === '"') {
+            $attr_value = substr($attributes, $value_pos, $ii - $value_pos);
+
+            $map[$name_value] = $attr_value;
+            $state = 'key';
+            break;
+          }
+
+          // Anything else is more text in the quoted value.
+          break;
+        case 'unquoted':
+          // We've started parsing an unquoted value, so look for terminating
+          // whitespace.
+
+          // We've found some whitespace, so pull out the actual value.
+          if ($is_space) {
+            $attr_value = substr($attributes, $value_pos, $ii - $value_pos);
+
+            $map[$name_value] = $attr_value;
+            $state = 'key';
+            break;
+          }
+
+          // Anything else is more text in the unquoted value.
+          break;
+      }
+    }
+
+    switch ($state) {
+      case 'key':
+        // We were looking for the start of an attribute name, so there's
+        // nothing to clean up.
+        break;
+      case 'name':
+        // We were looking for the end of an attribute name. Treat whatever
+        // we found as a name.
+        $name_value = substr($attributes, $name_pos, $len - $name_pos);
+        $name_value = phutil_utf8_strtolower($name_value);
+
+        if (isset($map[$name_value])) {
+          return null;
+        }
+
+        $map[$name_value] = true;
+        break;
+      case 'equals':
+      case 'value':
+        // We found an attribute name followed by whitespace or an "=". Treat
+        // whatever we found as a valid attribute name with no value.
+
+        if (isset($map[$name_value])) {
+          return null;
+        }
+
+        $map[$name_value] = true;
+        break;
+      case 'quoted':
+      case 'unquoted':
+        // We were parsing a value but ran out of characters before we found
+        // the delimiter or closing quote. Treat whatever we found as a quoted
+        // value.
+
+        $attr_value = substr($attributes, $value_pos, $len - $value_pos);
+
+        $map[$name_value] = (string)$attr_value;
+        break;
+    }
+
+    return $map;
+  }
+
+}
diff --git a/src/parser/html/__tests__/PhutilHTMLParserTestCase.php b/src/parser/html/__tests__/PhutilHTMLParserTestCase.php
new file mode 100644
--- /dev/null
+++ b/src/parser/html/__tests__/PhutilHTMLParserTestCase.php
@@ -0,0 +1,46 @@
+<?php
+
+/**
+ * Data-driven tests for PhutilHTMLParser. Each file in "data/" holds an input
+ * document and the expected DOM tree as JSON, separated by a line of tildes.
+ */
+final class PhutilHTMLParserTestCase
+  extends PhutilTestCase {
+
+  public function testHTMLParser() {
+    $root = dirname(__FILE__).'/data/';
+    $tests = Filesystem::listDirectory($root, $include_hidden = false);
+
+    foreach ($tests as $test) {
+      $path = $root.$test;
+      $data = Filesystem::readFile($path);
+
+      $parts = explode("\n~~~~~~~~~~\n", $data);
+      if (count($parts) !== 2) {
+        throw new Exception(
+          pht(
+            'Expected "~~~~~~~~~~" delimiter in test "%s".',
+            $test));
+      }
+
+      $input = $parts[0];
+      $expect = phutil_json_decode($parts[1]);
+
+      $document = id(new PhutilHTMLParser())
+        ->parseDocument($input);
+
+      // We're just testing the child list of the root node since this
+      // reduces the amount of boilerplate in the test cases.
+      $list = array();
+      foreach ($document->getChildren() as $child) {
+        $list[] = $child->toDictionary();
+      }
+
+      $this->assertEqual(
+        $expect,
+        $list,
+        pht('DOM tree for "%s".', $test));
+    }
+  }
+
+}
diff --git a/src/parser/html/__tests__/data/attributes-basic.txt b/src/parser/html/__tests__/data/attributes-basic.txt
new file mode 100644
--- /dev/null
+++ b/src/parser/html/__tests__/data/attributes-basic.txt
@@ -0,0 +1,13 @@
+<a b=1 c d="e" />
+~~~~~~~~~~
+[
+ {
+ "tag": "a",
+ "attributes": {
+ "b": "1",
+ "c": true,
+ "d": "e"
+ },
+ "children": []
+ }
+]
diff --git a/src/parser/html/__tests__/data/content-angle.txt b/src/parser/html/__tests__/data/content-angle.txt
new file mode 100644
--- /dev/null
+++ b/src/parser/html/__tests__/data/content-angle.txt
@@ -0,0 +1,7 @@
+o< quack
+~~~~~~~~~~
+[
+ {
+ "content": "o< quack"
+ }
+]
diff --git a/src/parser/html/__tests__/data/content-simple.txt b/src/parser/html/__tests__/data/content-simple.txt
new file mode 100644
--- /dev/null
+++ b/src/parser/html/__tests__/data/content-simple.txt
@@ -0,0 +1,7 @@
+quack
+~~~~~~~~~~
+[
+ {
+ "content": "quack"
+ }
+]
diff --git a/src/parser/html/__tests__/data/tag-mismatch.txt b/src/parser/html/__tests__/data/tag-mismatch.txt
new file mode 100644
--- /dev/null
+++ b/src/parser/html/__tests__/data/tag-mismatch.txt
@@ -0,0 +1,21 @@
+<a><b><c></b></a>
+~~~~~~~~~~
+[
+ {
+ "tag": "a",
+ "attributes": {},
+ "children": [
+ {
+ "tag": "b",
+ "attributes": {},
+ "children": [
+ {
+ "tag": "c",
+ "attributes": {},
+ "children": []
+ }
+ ]
+ }
+ ]
+ }
+]
diff --git a/src/parser/html/__tests__/data/tag-simple.txt b/src/parser/html/__tests__/data/tag-simple.txt
new file mode 100644
--- /dev/null
+++ b/src/parser/html/__tests__/data/tag-simple.txt
@@ -0,0 +1,135 @@
+<a/>
+<a/ >
+< a/>
+<a />
+<a / >
+< a />
+< a / >
+<a></a>
+<a ></a >
+< a>< /a>
+< a >< /a >
+<a></ a>
+<a ></ a >
+< a>< / a>
+< a >< / a >
+~~~~~~~~~~
+[
+ {
+ "tag": "a",
+ "attributes": {},
+ "children": []
+ },
+ {
+ "content": "\n"
+ },
+ {
+ "tag": "a",
+ "attributes": {},
+ "children": []
+ },
+ {
+ "content": "\n"
+ },
+ {
+ "tag": "a",
+ "attributes": {},
+ "children": []
+ },
+ {
+ "content": "\n"
+ },
+ {
+ "tag": "a",
+ "attributes": {},
+ "children": []
+ },
+ {
+ "content": "\n"
+ },
+ {
+ "tag": "a",
+ "attributes": {},
+ "children": []
+ },
+ {
+ "content": "\n"
+ },
+ {
+ "tag": "a",
+ "attributes": {},
+ "children": []
+ },
+ {
+ "content": "\n"
+ },
+ {
+ "tag": "a",
+ "attributes": {},
+ "children": []
+ },
+ {
+ "content": "\n"
+ },
+ {
+ "tag": "a",
+ "attributes": {},
+ "children": []
+ },
+ {
+ "content": "\n"
+ },
+ {
+ "tag": "a",
+ "attributes": {},
+ "children": []
+ },
+ {
+ "content": "\n"
+ },
+ {
+ "tag": "a",
+ "attributes": {},
+ "children": []
+ },
+ {
+ "content": "\n"
+ },
+ {
+ "tag": "a",
+ "attributes": {},
+ "children": []
+ },
+ {
+ "content": "\n"
+ },
+ {
+ "tag": "a",
+ "attributes": {},
+ "children": []
+ },
+ {
+ "content": "\n"
+ },
+ {
+ "tag": "a",
+ "attributes": {},
+ "children": []
+ },
+ {
+ "content": "\n"
+ },
+ {
+ "tag": "a",
+ "attributes": {},
+ "children": []
+ },
+ {
+ "content": "\n"
+ },
+ {
+ "tag": "a",
+ "attributes": {},
+ "children": []
+ }
+]
diff --git a/src/parser/html/__tests__/data/tag-table.txt b/src/parser/html/__tests__/data/tag-table.txt
new file mode 100644
--- /dev/null
+++ b/src/parser/html/__tests__/data/tag-table.txt
@@ -0,0 +1,39 @@
+<table><tr><td>a</td><td>b</td><td /></tr></table>
+~~~~~~~~~~
+[
+ {
+ "tag": "table",
+ "attributes": {},
+ "children": [
+ {
+ "tag": "tr",
+ "attributes": {},
+ "children": [
+ {
+ "tag": "td",
+ "attributes": {},
+ "children": [
+ {
+ "content": "a"
+ }
+ ]
+ },
+ {
+ "tag": "td",
+ "attributes": {},
+ "children": [
+ {
+ "content": "b"
+ }
+ ]
+ },
+ {
+ "tag": "td",
+ "attributes": {},
+ "children": []
+ }
+ ]
+ }
+ ]
+ }
+]

File Metadata

Mime Type
text/plain
Expires
Sat, May 11, 10:48 AM (2 w, 6 d ago)
Storage Engine
blob
Storage Format
Encrypted (AES-256-CBC)
Storage Handle
6274890
Default Alt Text
D20568.diff (29 KB)

Event Timeline