Page Menu
Home
Phabricator
Search
Configure Global Search
Log In
Files
F15419599
D20568.id49067.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
29 KB
Referenced Files
None
Subscribers
None
D20568.id49067.diff
View Options
diff --git a/src/__phutil_library_map__.php b/src/__phutil_library_map__.php
--- a/src/__phutil_library_map__.php
+++ b/src/__phutil_library_map__.php
@@ -203,6 +203,7 @@
'PhutilCowsayTestCase' => 'utils/__tests__/PhutilCowsayTestCase.php',
'PhutilCsprintfTestCase' => 'xsprintf/__tests__/PhutilCsprintfTestCase.php',
'PhutilCzechLocale' => 'internationalization/locales/PhutilCzechLocale.php',
+ 'PhutilDOMNode' => 'parser/html/PhutilDOMNode.php',
'PhutilDaemon' => 'daemon/PhutilDaemon.php',
'PhutilDaemonHandle' => 'daemon/PhutilDaemonHandle.php',
'PhutilDaemonOverseer' => 'daemon/PhutilDaemonOverseer.php',
@@ -260,6 +261,8 @@
'PhutilGitURI' => 'parser/PhutilGitURI.php',
'PhutilGitURITestCase' => 'parser/__tests__/PhutilGitURITestCase.php',
'PhutilGoogleAuthAdapter' => 'auth/PhutilGoogleAuthAdapter.php',
+ 'PhutilHTMLParser' => 'parser/html/PhutilHTMLParser.php',
+ 'PhutilHTMLParserTestCase' => 'parser/html/__tests__/PhutilHTMLParserTestCase.php',
'PhutilHTTPEngineExtension' => 'future/http/PhutilHTTPEngineExtension.php',
'PhutilHTTPResponse' => 'parser/http/PhutilHTTPResponse.php',
'PhutilHTTPResponseParser' => 'parser/http/PhutilHTTPResponseParser.php',
@@ -859,6 +862,7 @@
'PhutilCowsayTestCase' => 'PhutilTestCase',
'PhutilCsprintfTestCase' => 'PhutilTestCase',
'PhutilCzechLocale' => 'PhutilLocale',
+ 'PhutilDOMNode' => 'Phobject',
'PhutilDaemon' => 'Phobject',
'PhutilDaemonHandle' => 'Phobject',
'PhutilDaemonOverseer' => 'Phobject',
@@ -916,6 +920,8 @@
'PhutilGitURI' => 'Phobject',
'PhutilGitURITestCase' => 'PhutilTestCase',
'PhutilGoogleAuthAdapter' => 'PhutilOAuthAuthAdapter',
+ 'PhutilHTMLParser' => 'Phobject',
+ 'PhutilHTMLParserTestCase' => 'PhutilTestCase',
'PhutilHTTPEngineExtension' => 'Phobject',
'PhutilHTTPResponse' => 'Phobject',
'PhutilHTTPResponseParser' => 'Phobject',
diff --git a/src/markup/engine/remarkup/blockrule/PhutilRemarkupTableBlockRule.php b/src/markup/engine/remarkup/blockrule/PhutilRemarkupTableBlockRule.php
--- a/src/markup/engine/remarkup/blockrule/PhutilRemarkupTableBlockRule.php
+++ b/src/markup/engine/remarkup/blockrule/PhutilRemarkupTableBlockRule.php
@@ -11,7 +11,7 @@
while (isset($lines[$cursor])) {
$num_lines++;
- if (preg_match('@</table>$@i', $lines[$cursor])) {
+ if (preg_match('@</table>\s*$@i', $lines[$cursor])) {
break;
}
$cursor++;
@@ -22,86 +22,117 @@
}
public function markupText($text, $children) {
- $matches = array();
+ $root = id(new PhutilHTMLParser())
+ ->parseDocument($text);
- if (!preg_match('@^\s*<table>(.*)</table>$@si', $text, $matches)) {
- return $this->fail(
- $text,
- pht('Bad table (expected %s)', '<table>...</table>'));
- }
+ $nodes = $root->selectChildrenWithTags(array('table'));
- $body = $matches[1];
+ $out = array();
+ $seen_table = false;
+ foreach ($nodes as $node) {
+ if ($node->isContentNode()) {
+ $content = $node->getContent();
- $row_fragment = '(?:\s*<tr>(.*)</tr>\s*)';
- $cell_fragment = '(?:\s*<(td|th)>(.*)</(?:td|th)>\s*)';
+ if (!strlen(trim($content))) {
+ // Ignore whitespace.
+ continue;
+ }
- // Test that the body contains only valid rows.
- if (!preg_match('@^'.$row_fragment.'+$@Usi', $body)) {
- return $this->fail(
- $body,
- pht('Bad table syntax (expected rows %s)', '<tr>...</tr>'));
- }
+ // If we find other content, fail the rule. This can happen if the
+ // input is two consecutive table tags on one line with some text
+ // in between them, which we currently forbid.
+ return $text;
+ } else {
+ // If we have multiple table tags, just return the raw text.
+ if ($seen_table) {
+ return $text;
+ }
+ $seen_table = true;
- // Capture the rows.
- $row_regex = '@'.$row_fragment.'@Usi';
- if (!preg_match_all($row_regex, $body, $matches, PREG_SET_ORDER)) {
- throw new Exception(
- pht('Bug in Remarkup tables, parsing fails for input: %s', $text));
+ $out[] = $this->newTable($node);
+ }
}
- $out_rows = array();
+ return phutil_implode_html('', $out);
+ }
- $rows = $matches;
- foreach ($rows as $row) {
- $content = $row[1];
+ private function newTable(PhutilDOMNode $table) {
+ $nodes = $table->selectChildrenWithTags(
+ array(
+ 'colgroup',
+ 'tr',
+ ));
- // Test that the row contains only valid cells.
- if (!preg_match('@^'.$cell_fragment.'+$@Usi', $content)) {
- return $this->fail(
- $content,
- pht('Bad table syntax (expected cells %s)', '<td>...</td>'));
- }
+ $colgroup = null;
+ $rows = array();
+
+ foreach ($nodes as $node) {
+ if ($node->isContentNode()) {
+ $content = $node->getContent();
+
+ // If this is whitespace, ignore it.
+ if (!strlen(trim($content))) {
+ continue;
+ }
- // Capture the cells.
- $cell_regex = '@'.$cell_fragment.'@Usi';
- if (!preg_match_all($cell_regex, $content, $matches, PREG_SET_ORDER)) {
- throw new Exception(
- pht('Bug in Remarkup tables, parsing fails for input: %s', $text));
+ // If we have nonempty content between the rows, this isn't a valid
+ // table. We can't really do anything reasonable with this, so just
+ // fail out and render the raw text.
+ return $table->newRawString();
}
- $out_cells = array();
- foreach ($matches as $cell) {
- $cell_type = $cell[1];
- $cell_content = $cell[2];
+ if ($node->getTagName() === 'colgroup') {
+ // This table has multiple "<colgroup />" tags. Just bail out.
+ if ($colgroup !== null) {
+ return $table->newRawString();
+ }
- $out_cells[] = array(
- 'type' => $cell_type,
- 'content' => $this->applyRules($cell_content),
- );
+ // This table has a "<colgroup />" after a "<tr />". We could parse
+ // this, but just reject it out of an abundance of caution.
+ if ($rows) {
+ return $table->newRawString();
+ }
+
+ $colgroup = $node;
+ continue;
}
- $out_rows[] = array(
- 'type' => 'tr',
- 'content' => $out_cells,
- );
+ $rows[] = $node;
}
- return $this->renderRemarkupTable($out_rows);
- }
+ $row_specs = array();
- private function fail($near, $message) {
- $message = sprintf(
- '%s near: %s',
- $message,
- id(new PhutilUTF8StringTruncator())
- ->setMaximumGlyphs(32000)
- ->truncateString($near));
+ foreach ($rows as $row) {
+ $cells = $row->selectChildrenWithTags(array('td', 'th'));
- if ($this->getEngine()->isTextMode()) {
- return '('.$message.')';
+      // Build one spec per "<td>" / "<th>" cell in this row.
+      $cell_specs = array();
+      foreach ($cells as $cell) {
+        if ($cell->isContentNode()) {
+          // Inspect the content of the node we are iterating over. Reading
+          // "$node" here would pick up the stale loop variable left behind
+          // by the earlier "foreach ($nodes as $node)" loop, so stray text
+          // between cells would be checked against the wrong node.
+          $content = $cell->getContent();
+
+          // Whitespace between cells is harmless, so skip it.
+          if (!strlen(trim($content))) {
+            continue;
+          }
+
+          // Nonempty content between cells means this is not a valid table,
+          // so fall back to rendering the raw text.
+          //
+          // NOTE(review): PhutilDOMNode as added in this diff defines
+          // getRawString() but no newRawString(); confirm that method exists
+          // before landing, otherwise this path will fatal.
+          return $table->newRawString();
+        }
+
+        // Apply the inline remarkup rules to the raw cell body.
+        $content = $cell->getRawContentString();
+        $content = $this->applyRules($content);
+
+        $cell_specs[] = array(
+          'type' => $cell->getTagName(),
+          'content' => $content,
+        );
+      }
+
+ $row_specs[] = array(
+ 'type' => 'tr',
+ 'content' => $cell_specs,
+ );
}
- return hsprintf('<div style="color: red;">%s</div>', $message);
+ return $this->renderRemarkupTable($row_specs);
}
}
diff --git a/src/parser/html/PhutilDOMNode.php b/src/parser/html/PhutilDOMNode.php
new file mode 100644
--- /dev/null
+++ b/src/parser/html/PhutilDOMNode.php
@@ -0,0 +1,193 @@
+<?php
+
+/**
+ * A node in the simple DOM tree built by @{class:PhutilHTMLParser}.
+ *
+ * A node is either a "content node" (it carries a string of content and
+ * @{method:isContentNode} returns true) or a "tag node" (it carries a tag
+ * name, an attribute map and a list of child nodes).
+ */
+final class PhutilDOMNode extends Phobject {
+
+  // Text carried by a content node; null for tag nodes.
+  private $content;
+
+  // Tag name of a tag node; left null when never set.
+  private $tagName;
+
+  // Ordered list of child PhutilDOMNode objects.
+  private $children = array();
+
+  // Map of attribute name to attribute value.
+  private $attributes = array();
+
+  // Node this node was appended to, if any.
+  private $parentNode;
+
+  // Original source text this node was built from.
+  private $rawString;
+
+  /**
+   * Set the text content, which makes this node a content node.
+   *
+   * @param string Content text.
+   * @return this
+   */
+  public function setContent($content) {
+    $this->content = $content;
+    return $this;
+  }
+
+  /**
+   * @return string|null Content text, or null if none has been set.
+   */
+  public function getContent() {
+    return $this->content;
+  }
+
+  /**
+   * Test whether this node carries content (any non-null value, including
+   * the empty string) rather than a tag.
+   *
+   * @return bool True if content has been set on this node.
+   */
+  public function isContentNode() {
+    return ($this->content !== null);
+  }
+
+  /**
+   * @param string Tag name.
+   * @return this
+   */
+  public function setTagName($tag_name) {
+    $this->tagName = $tag_name;
+    return $this;
+  }
+
+  /**
+   * @return string|null Tag name, or null if none has been set.
+   */
+  public function getTagName() {
+    return $this->tagName;
+  }
+
+  /**
+   * Append a child to this node and point the child's parent at this node.
+   *
+   * @param PhutilDOMNode Node to append.
+   * @return this
+   */
+  public function appendChild(PhutilDOMNode $node) {
+    $node->parentNode = $this;
+    $this->children[] = $node;
+    return $this;
+  }
+
+  /**
+   * @return list<PhutilDOMNode> Child nodes, in append order.
+   */
+  public function getChildren() {
+    return $this->children;
+  }
+
+  /**
+   * @return PhutilDOMNode|null Parent node, or null if this node has not
+   *   been appended to another node.
+   */
+  public function getParentNode() {
+    return $this->parentNode;
+  }
+
+  /**
+   * @param map<string, wild> Attribute map.
+   * @return this
+   */
+  public function setAttributes(array $attributes) {
+    $this->attributes = $attributes;
+    return $this;
+  }
+
+  /**
+   * @return map<string, wild> Attribute map.
+   */
+  public function getAttributes() {
+    return $this->attributes;
+  }
+
+  /**
+   * @param string Source text for this node.
+   * @return this
+   */
+  public function setRawString($raw_string) {
+    $this->rawString = $raw_string;
+    return $this;
+  }
+
+  /**
+   * @return string|null Source text for this node, if it has been set.
+   */
+  public function getRawString() {
+    return $this->rawString;
+  }
+
+  /**
+   * Convert this node, and recursively its children, into plain arrays.
+   *
+   * Content nodes produce `array('content' => ...)`. Tag nodes produce an
+   * array with "tag", "attributes" and "children" keys.
+   *
+   * @return map<string, wild> Dictionary representation of this subtree.
+   */
+  public function toDictionary() {
+    if ($this->isContentNode()) {
+      return array(
+        'content' => $this->content,
+      );
+    } else {
+      $children = array();
+
+      foreach ($this->getChildren() as $child) {
+        $children[] = $child->toDictionary();
+      }
+
+      return array(
+        'tag' => $this->getTagName(),
+        'attributes' => $this->getAttributes(),
+        'children' => $children,
+      );
+    }
+  }
+
+  /**
+   * Get a list of the children of a given DOM node, treating unexpected
+   * tags as if they were raw content.
+   *
+   * Adjacent content nodes in the result (including converted tags) are
+   * merged into single content nodes.
+   *
+   * @param list<string> Tag names to keep as tag nodes.
+   * @return list<PhutilDOMNode> Filtered and merged list of nodes.
+   */
+  public function selectChildrenWithTags(array $tag_list) {
+    $tag_map = array_fuse($tag_list);
+
+    $nodes = array();
+    foreach ($this->getChildren() as $child) {
+      // If this is already a content node, just keep it as-is.
+      if ($child->isContentNode()) {
+        $nodes[] = $child;
+        continue;
+      }
+
+      $tag_name = $child->getTagName();
+
+      // If this is a tag that we're allowing, keep it as-is.
+      if (isset($tag_map[$tag_name])) {
+        $nodes[] = $child;
+        continue;
+      }
+
+      // Otherwise, this is some other tag. Convert it into a content
+      // node built from the tag's raw string.
+      //
+      // NOTE(review): this uses only the child's own raw string, not the
+      // raw strings of its descendants; confirm that is the intended
+      // behavior for nested markup.
+      $raw_content = $child->getRawString();
+
+      // The setter for the raw source text is setRawString(); this class
+      // has no setRawContent() method.
+      $nodes[] = id(new self())
+        ->setContent($raw_content)
+        ->setRawString($raw_content);
+    }
+
+    return $this->mergeContentNodes($nodes);
+  }
+
+  /**
+   * Get the raw string of the first node produced by treating every child
+   * as content.
+   *
+   * @return string Raw string of the first resulting node, or the empty
+   *   string if there are no resulting nodes.
+   */
+  public function getRawContentString() {
+    $content_node = $this->selectChildrenWithTags(array());
+
+    if (!$content_node) {
+      return '';
+    }
+
+    return head($content_node)->getRawString();
+  }
+
+  /**
+   * Recursively merge runs of adjacent content nodes in this subtree,
+   * replacing this node's child list in place.
+   *
+   * @return this
+   */
+  public function mergeContent() {
+    $this->children = $this->mergeContentNodes($this->children);
+
+    foreach ($this->getChildren() as $child) {
+      // Merged content nodes are newly created, so (re)attach every child.
+      $child->parentNode = $this;
+      $child->mergeContent();
+    }
+
+    return $this;
+  }
+
+  /**
+   * Given a list of nodes, combine sequences of multiple adjacent content
+   * nodes into single nodes.
+   *
+   * Runs whose combined raw string is empty are dropped entirely.
+   *
+   * @param list<PhutilDOMNode> Nodes to merge.
+   * @return list<PhutilDOMNode> Nodes with adjacent content merged.
+   */
+  private function mergeContentNodes(array $nodes) {
+    $list = array();
+    $content_block = array();
+
+    // Walk the list we were given, not this node's own children: callers
+    // like selectChildrenWithTags() pass a list which differs from the
+    // child list.
+    foreach ($nodes as $node) {
+      if ($node->isContentNode()) {
+        $content_block[] = $node;
+        continue;
+      }
+
+      // A tag node ends the current run of content nodes.
+      $list[] = $content_block;
+      $content_block = array();
+
+      $list[] = $node;
+    }
+
+    $list[] = $content_block;
+
+    $results = array();
+    foreach ($list as $item) {
+      // Tag nodes pass through untouched.
+      if (!is_array($item)) {
+        $results[] = $item;
+        continue;
+      }
+
+      // Skip empty runs.
+      if (!$item) {
+        continue;
+      }
+
+      $parts = array();
+      foreach ($item as $content_node) {
+        $parts[] = $content_node->getRawString();
+      }
+      $parts = implode('', $parts);
+
+      if (!strlen($parts)) {
+        continue;
+      }
+
+      $results[] = id(new self())
+        ->setContent($parts)
+        ->setRawString($parts);
+    }
+
+    return $results;
+  }
+
+}
diff --git a/src/parser/html/PhutilHTMLParser.php b/src/parser/html/PhutilHTMLParser.php
new file mode 100644
--- /dev/null
+++ b/src/parser/html/PhutilHTMLParser.php
@@ -0,0 +1,426 @@
+<?php
+
+final class PhutilHTMLParser extends Phobject {
+
+ private $cursor;
+
+  /**
+   * Parse a block of HTML-like text into a tree of DOM nodes.
+   *
+   * Anything that cannot be interpreted as a valid tag is kept as content,
+   * so parsing never fails outright.
+   *
+   * @param string Text to parse.
+   * @return PhutilDOMNode Root node; the parsed nodes are its descendants.
+   */
+  public function parseDocument($corpus) {
+    // Divide the block into sequences of "tag" and "non-tag" content. Tag
+    // content is anything between angle brackets ("<" and ">"). Non-tag
+    // content is anything else.
+
+    // Measure the corpus once instead of on every loop iteration.
+    $corpus_length = strlen($corpus);
+
+    $segment_pos = 0;
+    $segments = array();
+    $in_tag = false;
+
+    for ($ii = 0; $ii < $corpus_length; $ii++) {
+      $c = $corpus[$ii];
+
+      if ($in_tag && ($c === '>')) {
+        // End of a tag: the segment includes the closing ">".
+        $segments[] = array(
+          'tag' => true,
+          'pos' => $segment_pos,
+          'end' => $ii + 1,
+        );
+
+        $segment_pos = $ii + 1;
+        $in_tag = false;
+        continue;
+      }
+
+      if (!$in_tag && ($c === '<')) {
+        // Start of a tag: flush the non-tag content before it. This may be
+        // a zero-length segment; empty content is discarded when content
+        // nodes are merged below.
+        $segments[] = array(
+          'tag' => false,
+          'pos' => $segment_pos,
+          'end' => $ii,
+        );
+
+        $segment_pos = $ii;
+        $in_tag = true;
+        continue;
+      }
+    }
+
+    // Add whatever content was left at the end of the string. If we were in
+    // a tag but did not find a closing ">", we treat this as normal content.
+    $segments[] = array(
+      'tag' => false,
+      'pos' => $segment_pos,
+      'end' => $ii,
+    );
+
+    // Slice the marked segments out of the raw corpus so we get a list of
+    // "tag" strings and a list of "non-tag" strings.
+
+    $parts = array();
+    foreach ($segments as $segment) {
+      $tag = $segment['tag'];
+      $pos = $segment['pos'];
+      $len = $segment['end'] - $pos;
+
+      // If this is a tag, we'll drop the "<" at the beginning and the ">"
+      // at the end here.
+      if ($tag) {
+        $slice_pos = $pos + 1;
+        $slice_len = $len - 2;
+      } else {
+        $slice_pos = $pos;
+        $slice_len = $len;
+      }
+
+      if (($slice_pos < $corpus_length) && ($slice_len > 0)) {
+        $content = substr($corpus, $slice_pos, $slice_len);
+      } else {
+        $content = '';
+      }
+
+      $parts[] = array(
+        'tag' => $tag,
+        'pos' => $pos,
+        'len' => $len,
+        'content' => $content,
+      );
+    }
+
+    $root = new PhutilDOMNode();
+    $this->setCursor($root);
+
+    foreach ($parts as $part) {
+      // A non-null result means the part was consumed as an opening or
+      // closing tag and the tree has already been updated.
+      $tag = $this->newTagDOMNode($part);
+
+      if ($tag !== null) {
+        continue;
+      }
+
+      $content = $part['content'];
+
+      // If this part is a tag, restore the angle brackets.
+      if ($part['tag']) {
+        $content = '<'.$content.'>';
+      }
+
+      $node = id(new PhutilDOMNode())
+        ->setContent($content)
+        ->setRawString($content);
+
+      $this->getCursor()->appendChild($node);
+    }
+
+    // Collapse adjacent content nodes and drop empty ones.
+    $root->mergeContent();
+
+    return $root;
+  }
+
+  /**
+   * Try to interpret one parsed part as an opening, closing or
+   * self-closing tag, updating the tree and the cursor on success.
+   *
+   * @param map<string, wild> Part with "tag" and "content" keys.
+   * @return PhutilDOMNode|true|null The new node for an opening or
+   *   self-closing tag, true for a closing tag which matched an open tag,
+   *   or null if the part must be treated as plain content.
+   */
+  private function newTagDOMNode(array $part) {
+    if (!$part['tag']) {
+      return null;
+    }
+
+    $raw_body = $part['content'];
+    $body = trim($raw_body);
+
+    // A leading "/", as in "</td>", marks a closing tag.
+    $is_close = (strlen($body) > 0 && $body[0] === '/');
+    if ($is_close) {
+      $body = trim(substr($body, 1));
+    }
+
+    // A trailing "/", as in "<td />", marks a self-closing tag.
+    $body_len = strlen($body);
+    $self_close = ($body_len > 0 && $body[$body_len - 1] === '/');
+    if ($self_close) {
+      $body = trim(substr($body, 0, $body_len - 1));
+    }
+
+    // Something like "</td/>" is malformed, so it stays as content.
+    if ($is_close && $self_close) {
+      return null;
+    }
+
+    // Separate the tag name from the attribute string, if there is one.
+    $pieces = preg_split('/\s+/', $body, 2);
+    $tag_name = $pieces[0];
+    $attributes = (count($pieces) > 1) ? $pieces[1] : '';
+
+    // A tag with no name is not a tag at all.
+    if (!strlen($tag_name)) {
+      return null;
+    }
+
+    // Closing tags may not carry attributes.
+    if ($is_close && strlen($attributes)) {
+      return null;
+    }
+
+    $tag_name = phutil_utf8_strtolower($tag_name);
+
+    if ($is_close) {
+      // Walk up from the cursor looking for the nearest open tag with this
+      // name. If one exists, close it (and everything nested inside it) by
+      // moving the cursor to its parent. Otherwise, the closing tag is
+      // just content.
+      $ancestor = $this->getCursor();
+      while ($ancestor) {
+        if ($ancestor->getTagName() === $tag_name) {
+          $this->setCursor($ancestor->getParentNode());
+          return true;
+        }
+        $ancestor = $ancestor->getParentNode();
+      }
+
+      return null;
+    }
+
+    $attribute_map = array();
+    if (strlen($attributes)) {
+      $attribute_map = $this->parseAttributes($attributes);
+
+      // Unparseable attributes demote the whole tag to content.
+      if ($attribute_map === null) {
+        return null;
+      }
+    }
+
+    $node = id(new PhutilDOMNode())
+      ->setTagName($tag_name)
+      ->setAttributes($attribute_map)
+      ->setRawString('<'.$raw_body.'>');
+
+    $this->getCursor()->appendChild($node);
+
+    // Only tags which stay open become the new insertion point.
+    if (!$self_close) {
+      $this->setCursor($node);
+    }
+
+    return $node;
+  }
+
+  /**
+   * Move the insertion point: subsequently parsed nodes are appended to
+   * the cursor node.
+   *
+   * @param PhutilDOMNode Node to use as the new insertion point.
+   * @return this
+   */
+  private function setCursor(PhutilDOMNode $cursor) {
+    $this->cursor = $cursor;
+
+    return $this;
+  }
+
+  /**
+   * @return PhutilDOMNode Node which currently receives new children.
+   */
+  private function getCursor() {
+    $cursor = $this->cursor;
+
+    return $cursor;
+  }
+
+  /**
+   * Parse the attribute portion of a tag, like `b=1 c d="e"`, into a map.
+   *
+   * Attribute names are lowercased. Attributes without a value map to
+   * `true`. Values may be unquoted or double-quoted.
+   *
+   * @param string Raw attribute text, without the tag name.
+   * @return map<string, wild>|null Attribute map, or null if the
+   *   attributes are malformed (for example, a duplicate name or a stray
+   *   "=") and the tag should be treated as content.
+   */
+  private function parseAttributes($attributes) {
+    $state = 'key';
+
+    $whitespace = array(
+      ' ' => true,
+      "\n" => true,
+      "\t" => true,
+      "\r" => true,
+    );
+
+    $map = array();
+    $len = strlen($attributes);
+
+    // Position of the name being parsed, position of the value being
+    // parsed, and the most recently completed (lowercased) name.
+    $name_pos = null;
+    $value_pos = null;
+    $name_value = null;
+
+    for ($ii = 0; $ii < $len; $ii++) {
+      $c = $attributes[$ii];
+      $is_space = isset($whitespace[$c]);
+
+      switch ($state) {
+        case 'key':
+          // We're looking for the start of an attribute name.
+
+          // Skip over any whitespace.
+          if ($is_space) {
+            break;
+          }
+
+          // If we see "<tag =...", that isn't valid. Treat this tag as
+          // content.
+          if ($c === '=') {
+            return null;
+          }
+
+          // If we see a quotation mark with no attribute name, that isn't
+          // valid. Treat this tag as content.
+          if ($c === '"') {
+            return null;
+          }
+
+          // Any other character marks the beginning of an attribute name.
+          // Switch the parser state to "name" to parse the name.
+          $name_pos = $ii;
+          $state = 'name';
+          break;
+        case 'name':
+          // We're looking for the end of an attribute name.
+
+          // Finding a "=" or a space character ends the attribute name.
+          // Save it, then figure out what to do with the parser state.
+          if ($c === '=' || $is_space) {
+            $name_value = substr($attributes, $name_pos, $ii - $name_pos);
+            $name_value = phutil_utf8_strtolower($name_value);
+
+            // If this attribute already exists, the tag is invalid. This
+            // means the input is something like "<tag a=1 a=2>".
+            if (isset($map[$name_value])) {
+              return null;
+            }
+          }
+
+          // If we find an "=", that's the end of the name. Next, we're
+          // going to parse a value.
+          if ($c === '=') {
+            $state = 'value';
+            break;
+          }
+
+          // If we find whitespace, that's the end of the name. We're going
+          // to look for an "=".
+          if ($is_space) {
+            $state = 'equals';
+            break;
+          }
+
+          break;
+        case 'equals':
+          // We've parsed the name of an attribute and are looking for an
+          // "=" character.
+
+          // Skip over any whitespace.
+          if ($is_space) {
+            break;
+          }
+
+          // This is the "=" we're looking for, so we're good to go.
+          if ($c === '=') {
+            $state = 'value';
+            break;
+          }
+
+          // If this is anything else, this is an attribute name with no
+          // value. Treat it as "true" and move on. This corresponds to an
+          // input like "<input disabled>".
+          $map[$name_value] = true;
+          $name_pos = $ii;
+          $state = 'name';
+          break;
+        case 'value':
+          // We've parsed an "=" and are looking for the start of a value.
+
+          // Skip over any whitespace.
+          if ($is_space) {
+            break;
+          }
+
+          // Don't accept "<tag a==" to mean that key "a" has a value of
+          // "=", since this is silly. To specify a value beginning with
+          // "=", you have to quote it.
+          if ($c === '=') {
+            return null;
+          }
+
+          // Anything else is a value.
+          $value_pos = $ii;
+
+          // This is a quotation mark, so parse a quoted value.
+          if ($c === '"') {
+            $value_pos = $value_pos + 1;
+            $state = 'quoted';
+          } else {
+            $state = 'unquoted';
+          }
+          break;
+        case 'quoted':
+          // We've started parsing a quoted value, so look for the closing
+          // quote.
+
+          // We found the closing quote, so pull out the actual value.
+          if ($c === '"') {
+            $attr_value = substr($attributes, $value_pos, $ii - $value_pos);
+
+            $map[$name_value] = $attr_value;
+            $state = 'key';
+            break;
+          }
+
+          // Anything else is more text in the quoted value.
+          break;
+        case 'unquoted':
+          // We've started parsing an unquoted value, so look for
+          // terminating whitespace.
+
+          // We've found some whitespace, so pull out the actual value.
+          if ($is_space) {
+            $attr_value = substr($attributes, $value_pos, $ii - $value_pos);
+
+            $map[$name_value] = $attr_value;
+            $state = 'key';
+            break;
+          }
+
+          // Anything else is more text in the unquoted value.
+          break;
+      }
+    }
+
+    // We ran out of input. Finish whatever we were in the middle of.
+    switch ($state) {
+      case 'key':
+        // We were looking for the start of an attribute name, so there's
+        // nothing to clean up.
+        break;
+      case 'name':
+        // We were looking for the end of an attribute name. Treat whatever
+        // we found as a name.
+        $name_value = substr($attributes, $name_pos, $len - $name_pos);
+
+        // Normalize case exactly like names found mid-string, so that
+        // "<a b B>" is rejected as a duplicate and keys are consistently
+        // lowercase.
+        $name_value = phutil_utf8_strtolower($name_value);
+
+        if (isset($map[$name_value])) {
+          return null;
+        }
+
+        $map[$name_value] = true;
+        break;
+      case 'equals':
+      case 'value':
+        // We found an attribute name followed by whitespace or an "=".
+        // Treat whatever we found as a valid attribute name with no value.
+
+        if (isset($map[$name_value])) {
+          return null;
+        }
+
+        $map[$name_value] = true;
+        break;
+      case 'quoted':
+      case 'unquoted':
+        // We were parsing a value but ran out of characters before we found
+        // the delimiter or closing quote. Treat whatever we found as a
+        // quoted value.
+
+        // The remaining length is measured from the start of the value,
+        // not from the start of the name.
+        $attr_value = substr($attributes, $value_pos, $len - $value_pos);
+
+        $map[$name_value] = $attr_value;
+        break;
+    }
+
+    return $map;
+  }
+
+}
diff --git a/src/parser/html/__tests__/PhutilHTMLParserTestCase.php b/src/parser/html/__tests__/PhutilHTMLParserTestCase.php
new file mode 100644
--- /dev/null
+++ b/src/parser/html/__tests__/PhutilHTMLParserTestCase.php
@@ -0,0 +1,45 @@
+<?php
+
+/**
+ * Data-driven tests for @{class:PhutilHTMLParser}.
+ *
+ * Each file in the "data/" directory next to this test contains an input
+ * document, a "~~~~~~~~~~" delimiter line, and the expected DOM tree as
+ * JSON.
+ */
+final class PhutilHTMLParserTestCase
+  extends PhutilTestCase {
+
+  /**
+   * Parse every fixture and compare the resulting tree with the expected
+   * JSON.
+   */
+  public function testHTMLParser() {
+
+    $root = dirname(__FILE__).'/data/';
+    $tests = Filesystem::listDirectory($root, $include_hidden = false);
+
+    foreach ($tests as $test) {
+      $path = $root.$test;
+      $data = Filesystem::readFile($path);
+
+      // A fixture is exactly two sections: the input and the expectation.
+      $parts = explode("\n~~~~~~~~~~\n", $data);
+      if (count($parts) !== 2) {
+        throw new Exception(
+          pht(
+            'Expected "~~~~~~~~~~" delimiter in test "%s".',
+            $test));
+      }
+
+      $input = $parts[0];
+      $expect = phutil_json_decode($parts[1]);
+
+      $document = id(new PhutilHTMLParser())
+        ->parseDocument($input);
+
+      // We're just testing the child list of the root node since this
+      // reduces the amount of boilerplate in the test cases.
+      $list = array();
+      foreach ($document->getChildren() as $child) {
+        $list[] = $child->toDictionary();
+      }
+
+      $this->assertEqual(
+        $expect,
+        $list,
+        pht('DOM tree for "%s".', $test));
+    }
+  }
+
+}
diff --git a/src/parser/html/__tests__/data/attributes-basic.txt b/src/parser/html/__tests__/data/attributes-basic.txt
new file mode 100644
--- /dev/null
+++ b/src/parser/html/__tests__/data/attributes-basic.txt
@@ -0,0 +1,13 @@
+<a b=1 c d="e" />
+~~~~~~~~~~
+[
+ {
+ "tag": "a",
+ "attributes": {
+ "b": "1",
+ "c": true,
+ "d": "e"
+ },
+ "children": []
+ }
+]
diff --git a/src/parser/html/__tests__/data/content-angle.txt b/src/parser/html/__tests__/data/content-angle.txt
new file mode 100644
--- /dev/null
+++ b/src/parser/html/__tests__/data/content-angle.txt
@@ -0,0 +1,7 @@
+o< quack
+~~~~~~~~~~
+[
+ {
+ "content": "o< quack"
+ }
+]
diff --git a/src/parser/html/__tests__/data/content-simple.txt b/src/parser/html/__tests__/data/content-simple.txt
new file mode 100644
--- /dev/null
+++ b/src/parser/html/__tests__/data/content-simple.txt
@@ -0,0 +1,7 @@
+quack
+~~~~~~~~~~
+[
+ {
+ "content": "quack"
+ }
+]
diff --git a/src/parser/html/__tests__/data/tag-mismatch.txt b/src/parser/html/__tests__/data/tag-mismatch.txt
new file mode 100644
--- /dev/null
+++ b/src/parser/html/__tests__/data/tag-mismatch.txt
@@ -0,0 +1,21 @@
+<a><b><c></b></a>
+~~~~~~~~~~
+[
+ {
+ "tag": "a",
+ "attributes": {},
+ "children": [
+ {
+ "tag": "b",
+ "attributes": {},
+ "children": [
+ {
+ "tag": "c",
+ "attributes": {},
+ "children": []
+ }
+ ]
+ }
+ ]
+ }
+]
diff --git a/src/parser/html/__tests__/data/tag-simple.txt b/src/parser/html/__tests__/data/tag-simple.txt
new file mode 100644
--- /dev/null
+++ b/src/parser/html/__tests__/data/tag-simple.txt
@@ -0,0 +1,135 @@
+<a/>
+<a/ >
+< a/>
+<a />
+<a / >
+< a />
+< a / >
+<a></a>
+<a ></a >
+< a>< /a>
+< a >< /a >
+<a></ a>
+<a ></ a >
+< a>< / a>
+< a >< / a >
+~~~~~~~~~~
+[
+ {
+ "tag": "a",
+ "attributes": {},
+ "children": []
+ },
+ {
+ "content": "\n"
+ },
+ {
+ "tag": "a",
+ "attributes": {},
+ "children": []
+ },
+ {
+ "content": "\n"
+ },
+ {
+ "tag": "a",
+ "attributes": {},
+ "children": []
+ },
+ {
+ "content": "\n"
+ },
+ {
+ "tag": "a",
+ "attributes": {},
+ "children": []
+ },
+ {
+ "content": "\n"
+ },
+ {
+ "tag": "a",
+ "attributes": {},
+ "children": []
+ },
+ {
+ "content": "\n"
+ },
+ {
+ "tag": "a",
+ "attributes": {},
+ "children": []
+ },
+ {
+ "content": "\n"
+ },
+ {
+ "tag": "a",
+ "attributes": {},
+ "children": []
+ },
+ {
+ "content": "\n"
+ },
+ {
+ "tag": "a",
+ "attributes": {},
+ "children": []
+ },
+ {
+ "content": "\n"
+ },
+ {
+ "tag": "a",
+ "attributes": {},
+ "children": []
+ },
+ {
+ "content": "\n"
+ },
+ {
+ "tag": "a",
+ "attributes": {},
+ "children": []
+ },
+ {
+ "content": "\n"
+ },
+ {
+ "tag": "a",
+ "attributes": {},
+ "children": []
+ },
+ {
+ "content": "\n"
+ },
+ {
+ "tag": "a",
+ "attributes": {},
+ "children": []
+ },
+ {
+ "content": "\n"
+ },
+ {
+ "tag": "a",
+ "attributes": {},
+ "children": []
+ },
+ {
+ "content": "\n"
+ },
+ {
+ "tag": "a",
+ "attributes": {},
+ "children": []
+ },
+ {
+ "content": "\n"
+ },
+ {
+ "tag": "a",
+ "attributes": {},
+ "children": []
+ }
+]
diff --git a/src/parser/html/__tests__/data/tag-table.txt b/src/parser/html/__tests__/data/tag-table.txt
new file mode 100644
--- /dev/null
+++ b/src/parser/html/__tests__/data/tag-table.txt
@@ -0,0 +1,39 @@
+<table><tr><td>a</td><td>b</td><td /></tr></table>
+~~~~~~~~~~
+[
+ {
+ "tag": "table",
+ "attributes": {},
+ "children": [
+ {
+ "tag": "tr",
+ "attributes": {},
+ "children": [
+ {
+ "tag": "td",
+ "attributes": {},
+ "children": [
+ {
+ "content": "a"
+ }
+ ]
+ },
+ {
+ "tag": "td",
+ "attributes": {},
+ "children": [
+ {
+ "content": "b"
+ }
+ ]
+ },
+ {
+ "tag": "td",
+ "attributes": {},
+ "children": []
+ }
+ ]
+ }
+ ]
+ }
+]
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Sat, Mar 22, 7:16 AM (3 d, 6 h ago)
Storage Engine
blob
Storage Format
Encrypted (AES-256-CBC)
Storage Handle
7382398
Default Alt Text
D20568.id49067.diff (29 KB)
Attached To
Mode
D20568: Parse remarkup tables with something like a real parser instead of regular expressions
Attached
Detach File
Event Timeline
Log In to Comment