D20568.id49067.diff
No OneTemporary
Actions

Size

29 KB

Referenced Files

None

Subscribers

None

D20568.id49067.diff
View Options

	diff --git a/src/__phutil_library_map__.php b/src/__phutil_library_map__.php
	--- a/src/__phutil_library_map__.php
	+++ b/src/__phutil_library_map__.php
	@@ -203,6 +203,7 @@
	'PhutilCowsayTestCase' => 'utils/__tests__/PhutilCowsayTestCase.php',
	'PhutilCsprintfTestCase' => 'xsprintf/__tests__/PhutilCsprintfTestCase.php',
	'PhutilCzechLocale' => 'internationalization/locales/PhutilCzechLocale.php',
	+ 'PhutilDOMNode' => 'parser/html/PhutilDOMNode.php',
	'PhutilDaemon' => 'daemon/PhutilDaemon.php',
	'PhutilDaemonHandle' => 'daemon/PhutilDaemonHandle.php',
	'PhutilDaemonOverseer' => 'daemon/PhutilDaemonOverseer.php',
	@@ -260,6 +261,8 @@
	'PhutilGitURI' => 'parser/PhutilGitURI.php',
	'PhutilGitURITestCase' => 'parser/__tests__/PhutilGitURITestCase.php',
	'PhutilGoogleAuthAdapter' => 'auth/PhutilGoogleAuthAdapter.php',
	+ 'PhutilHTMLParser' => 'parser/html/PhutilHTMLParser.php',
	+ 'PhutilHTMLParserTestCase' => 'parser/html/__tests__/PhutilHTMLParserTestCase.php',
	'PhutilHTTPEngineExtension' => 'future/http/PhutilHTTPEngineExtension.php',
	'PhutilHTTPResponse' => 'parser/http/PhutilHTTPResponse.php',
	'PhutilHTTPResponseParser' => 'parser/http/PhutilHTTPResponseParser.php',
	@@ -859,6 +862,7 @@
	'PhutilCowsayTestCase' => 'PhutilTestCase',
	'PhutilCsprintfTestCase' => 'PhutilTestCase',
	'PhutilCzechLocale' => 'PhutilLocale',
	+ 'PhutilDOMNode' => 'Phobject',
	'PhutilDaemon' => 'Phobject',
	'PhutilDaemonHandle' => 'Phobject',
	'PhutilDaemonOverseer' => 'Phobject',
	@@ -916,6 +920,8 @@
	'PhutilGitURI' => 'Phobject',
	'PhutilGitURITestCase' => 'PhutilTestCase',
	'PhutilGoogleAuthAdapter' => 'PhutilOAuthAuthAdapter',
	+ 'PhutilHTMLParser' => 'Phobject',
	+ 'PhutilHTMLParserTestCase' => 'PhutilTestCase',
	'PhutilHTTPEngineExtension' => 'Phobject',
	'PhutilHTTPResponse' => 'Phobject',
	'PhutilHTTPResponseParser' => 'Phobject',
	diff --git a/src/markup/engine/remarkup/blockrule/PhutilRemarkupTableBlockRule.php b/src/markup/engine/remarkup/blockrule/PhutilRemarkupTableBlockRule.php
	--- a/src/markup/engine/remarkup/blockrule/PhutilRemarkupTableBlockRule.php
	+++ b/src/markup/engine/remarkup/blockrule/PhutilRemarkupTableBlockRule.php
	@@ -11,7 +11,7 @@

	while (isset($lines[$cursor])) {
	$num_lines++;
	- if (preg_match('@</table>$@i', $lines[$cursor])) {
	+ if (preg_match('@</table>\s*$@i', $lines[$cursor])) {
	break;
	}
	$cursor++;
	@@ -22,86 +22,117 @@
	}

	public function markupText($text, $children) {
	- $matches = array();
	+ $root = id(new PhutilHTMLParser())
	+ ->parseDocument($text);

	- if (!preg_match('@^\s<table>(.)</table>$@si', $text, $matches)) {
	- return $this->fail(
	- $text,
	- pht('Bad table (expected %s)', '<table>...</table>'));
	- }
	+ $nodes = $root->selectChildrenWithTags(array('table'));

	- $body = $matches[1];
	+ $out = array();
	+ $seen_table = false;
	+ foreach ($nodes as $node) {
	+ if ($node->isContentNode()) {
	+ $content = $node->getContent();

	- $row_fragment = '(?:\s<tr>(.)</tr>\s*)';
	- $cell_fragment = '(?:\s<(td\|th)>(.)</(?:td\|th)>\s*)';
	+ if (!strlen(trim($content))) {
	+ // Whitespace-only content around the table is harmless; skip it.
	+ continue;
	+ }

	- // Test that the body contains only valid rows.
	- if (!preg_match('@^'.$row_fragment.'+$@Usi', $body)) {
	- return $this->fail(
	- $body,
	- pht('Bad table syntax (expected rows %s)', '<tr>...</tr>'));
	- }
	+ // Any other top-level content (including non-"table" tags, which
	+ // selectChildrenWithTags() converts to content) fails the rule: we
	+ // return the input text unchanged. One way to get here is two table
	+ // tags on one line with text between them, which we currently forbid.
	+ return $text;
	+ } else {
	+ // Only one table per block is supported; on a second one, return the input text unchanged.
	+ if ($seen_table) {
	+ return $text;
	+ }
	+ $seen_table = true;

	- // Capture the rows.
	- $row_regex = '@'.$row_fragment.'@Usi';
	- if (!preg_match_all($row_regex, $body, $matches, PREG_SET_ORDER)) {
	- throw new Exception(
	- pht('Bug in Remarkup tables, parsing fails for input: %s', $text));
	+ $out[] = $this->newTable($node);
	+ }
	}

	- $out_rows = array();
	+ return phutil_implode_html('', $out);
	+ }

	- $rows = $matches;
	- foreach ($rows as $row) {
	- $content = $row[1];
	+ /**
	+ * Build the rendered table for a parsed "<table>" node.
	+ *
	+ * Direct children are restricted to "<colgroup>" and "<tr>"; each row's
	+ * children are restricted to "<td>" and "<th>". Whitespace-only content
	+ * between those tags is ignored. Any other content makes the table
	+ * invalid, in which case the raw string for the table is returned
	+ * instead of rendered markup.
	+ *
	+ * NOTE(review): "newRawString()" is not defined on PhutilDOMNode in this
	+ * diff (only "getRawString()" is) - confirm it is added elsewhere.
	+ *
	+ * @param PhutilDOMNode Parsed "<table>" node.
	+ * @return wild Result of renderRemarkupTable(), or the raw table string.
	+ */
	+ private function newTable(PhutilDOMNode $table) {
	+ $nodes = $table->selectChildrenWithTags(
	+ array(
	+ 'colgroup',
	+ 'tr',
	+ ));

	- // Test that the row contains only valid cells.
	- if (!preg_match('@^'.$cell_fragment.'+$@Usi', $content)) {
	- return $this->fail(
	- $content,
	- pht('Bad table syntax (expected cells %s)', '<td>...</td>'));
	- }
	+ // NOTE(review): $colgroup is recorded for validation below but is not
	+ // otherwise read in this method.
	+ $colgroup = null;
	+ $rows = array();
	+
	+ foreach ($nodes as $node) {
	+ if ($node->isContentNode()) {
	+ $content = $node->getContent();
	+
	+ // If this is whitespace, ignore it.
	+ if (!strlen(trim($content))) {
	+ continue;
	+ }

	- // Capture the cells.
	- $cell_regex = '@'.$cell_fragment.'@Usi';
	- if (!preg_match_all($cell_regex, $content, $matches, PREG_SET_ORDER)) {
	- throw new Exception(
	- pht('Bug in Remarkup tables, parsing fails for input: %s', $text));
	+ // If we have nonempty content between the rows, this isn't a valid
	+ // table. We can't really do anything reasonable with this, so just
	+ // fail out and render the raw text.
	+ return $table->newRawString();
	}

	- $out_cells = array();
	- foreach ($matches as $cell) {
	- $cell_type = $cell[1];
	- $cell_content = $cell[2];
	+ if ($node->getTagName() === 'colgroup') {
	+ // This table has multiple "<colgroup />" tags. Just bail out.
	+ if ($colgroup !== null) {
	+ return $table->newRawString();
	+ }

	- $out_cells[] = array(
	- 'type' => $cell_type,
	- 'content' => $this->applyRules($cell_content),
	- );
	+ // This table has a "<colgroup />" after a "<tr />". We could parse
	+ // this, but just reject it out of an abundance of caution.
	+ if ($rows) {
	+ return $table->newRawString();
	+ }
	+
	+ $colgroup = $node;
	+ continue;
	}

	- $out_rows[] = array(
	- 'type' => 'tr',
	- 'content' => $out_cells,
	- );
	+ $rows[] = $node;
	}

	- return $this->renderRemarkupTable($out_rows);
	- }
	+ $row_specs = array();

	- private function fail($near, $message) {
	- $message = sprintf(
	- '%s near: %s',
	- $message,
	- id(new PhutilUTF8StringTruncator())
	- ->setMaximumGlyphs(32000)
	- ->truncateString($near));
	+ foreach ($rows as $row) {
	+ $cells = $row->selectChildrenWithTags(array('td', 'th'));

	- if ($this->getEngine()->isTextMode()) {
	- return '('.$message.')';
	+ $cell_specs = array();
	+ foreach ($cells as $cell) {
	+ if ($cell->isContentNode()) {
	+ // Inspect the cell-level node itself. Reading "$node" here would
	+ // look at the stale variable left over from the row loop above.
	+ $content = $cell->getContent();
	+
	+ // Whitespace between cells is fine; anything else is invalid.
	+ if (!strlen(trim($content))) {
	+ continue;
	+ }
	+
	+ return $table->newRawString();
	+ }
	+
	+ $content = $cell->getRawContentString();
	+ $content = $this->applyRules($content);
	+
	+ $cell_specs[] = array(
	+ 'type' => $cell->getTagName(),
	+ 'content' => $content,
	+ );
	+ }
	+
	+ $row_specs[] = array(
	+ 'type' => 'tr',
	+ 'content' => $cell_specs,
	+ );
	}

	- return hsprintf('<div style="color: red;">%s</div>', $message);
	+ return $this->renderRemarkupTable($row_specs);
	}

	}
	diff --git a/src/parser/html/PhutilDOMNode.php b/src/parser/html/PhutilDOMNode.php
	new file mode 100644
	--- /dev/null
	+++ b/src/parser/html/PhutilDOMNode.php
	@@ -0,0 +1,193 @@
	+<?php
	+
	+/**
	+ * A node in the simple DOM tree produced by @{class:PhutilHTMLParser}.
	+ *
	+ * A node is either a "content node" (its content is non-null) or a tag
	+ * node (it has a tag name, attributes and children). Every node may also
	+ * carry the raw source string it was built from.
	+ */
	+final class PhutilDOMNode extends Phobject {
	+
	+  // Text of a content node. Stays null for tag nodes; see isContentNode().
	+  private $content;
	+  private $tagName;
	+  private $children = array();
	+  private $attributes = array();
	+  private $parentNode;
	+  // Source text this node was created from, as given to setRawString().
	+  private $rawString;
	+
	+  public function setContent($content) {
	+    $this->content = $content;
	+    return $this;
	+  }
	+
	+  public function getContent() {
	+    return $this->content;
	+  }
	+
	+  /**
	+   * Test whether this node holds content rather than a tag.
	+   *
	+   * @return bool True if content has been set to a non-null value.
	+   */
	+  public function isContentNode() {
	+    return ($this->content !== null);
	+  }
	+
	+  public function setTagName($tag_name) {
	+    $this->tagName = $tag_name;
	+    return $this;
	+  }
	+
	+  public function getTagName() {
	+    return $this->tagName;
	+  }
	+
	+  /**
	+   * Append a child node and make this node its parent.
	+   *
	+   * @param PhutilDOMNode Node to append.
	+   * @return this
	+   */
	+  public function appendChild(PhutilDOMNode $node) {
	+    $node->parentNode = $this;
	+    $this->children[] = $node;
	+    return $this;
	+  }
	+
	+  public function getChildren() {
	+    return $this->children;
	+  }
	+
	+  public function getParentNode() {
	+    return $this->parentNode;
	+  }
	+
	+  public function setAttributes(array $attributes) {
	+    $this->attributes = $attributes;
	+    return $this;
	+  }
	+
	+  public function getAttributes() {
	+    return $this->attributes;
	+  }
	+
	+  public function setRawString($raw_string) {
	+    $this->rawString = $raw_string;
	+    return $this;
	+  }
	+
	+  public function getRawString() {
	+    return $this->rawString;
	+  }
	+
	+  /**
	+   * Convert this node (recursively) into a plain array.
	+   *
	+   * Content nodes become `array('content' => ...)`. Tag nodes become an
	+   * array with "tag", "attributes" and "children" keys.
	+   *
	+   * @return map<string, wild> Dictionary describing this subtree.
	+   */
	+  public function toDictionary() {
	+    if ($this->isContentNode()) {
	+      return array(
	+        'content' => $this->content,
	+      );
	+    }
	+
	+    $children = array();
	+    foreach ($this->getChildren() as $child) {
	+      $children[] = $child->toDictionary();
	+    }
	+
	+    return array(
	+      'tag' => $this->getTagName(),
	+      'attributes' => $this->getAttributes(),
	+      'children' => $children,
	+    );
	+  }
	+
	+  /**
	+   * Get a list of the children of a given DOM node, treating unexpected
	+   * tags as if they were raw content.
	+   *
	+   * Children which are content nodes, or tags named in the list, are kept
	+   * as-is. Any other tag child is replaced by a new content node built
	+   * from that child's raw string. Adjacent content nodes in the result
	+   * are then merged.
	+   *
	+   * @param list<string> Tag names to keep as tag nodes.
	+   * @return list<PhutilDOMNode> Filtered, merged list of nodes.
	+   */
	+  public function selectChildrenWithTags(array $tag_list) {
	+    $tag_map = array_fuse($tag_list);
	+
	+    $nodes = array();
	+    foreach ($this->getChildren() as $child) {
	+      // If this is already a content node, just keep it as-is.
	+      if ($child->isContentNode()) {
	+        $nodes[] = $child;
	+        continue;
	+      }
	+
	+      // If this is a tag that we're allowing, keep it as-is.
	+      $tag_name = $child->getTagName();
	+      if (isset($tag_map[$tag_name])) {
	+        $nodes[] = $child;
	+        continue;
	+      }
	+
	+      // Otherwise, this is some other tag. Convert it into a content
	+      // node. The setter for the raw text is setRawString(); this class
	+      // has no setRawContent() method.
	+      $raw_content = $child->getRawString();
	+
	+      $nodes[] = id(new self())
	+        ->setContent($raw_content)
	+        ->setRawString($raw_content);
	+    }
	+
	+    return $this->mergeContentNodes($nodes);
	+  }
	+
	+  /**
	+   * Get the raw string of this node's children, treating every child tag
	+   * as content.
	+   *
	+   * @return string Raw string of the merged content, or "" if there is
	+   *   no content.
	+   */
	+  public function getRawContentString() {
	+    $content_node = $this->selectChildrenWithTags(array());
	+
	+    if (!$content_node) {
	+      return '';
	+    }
	+
	+    return head($content_node)->getRawString();
	+  }
	+
	+  /**
	+   * Recursively merge adjacent content nodes throughout this subtree.
	+   *
	+   * @return this
	+   */
	+  public function mergeContent() {
	+    $this->children = $this->mergeContentNodes($this->children);
	+
	+    foreach ($this->getChildren() as $child) {
	+      // Merged content nodes are new objects, so restore the parent link.
	+      $child->parentNode = $this;
	+      $child->mergeContent();
	+    }
	+
	+    return $this;
	+  }
	+
	+  /**
	+   * Given a list of nodes, combine sequences of multiple adjacent content
	+   * nodes into single nodes.
	+   *
	+   * Runs of content whose combined raw string is empty are dropped.
	+   *
	+   * @param list<PhutilDOMNode> Nodes to merge.
	+   * @return list<PhutilDOMNode> Nodes with adjacent content combined.
	+   */
	+  private function mergeContentNodes(array $nodes) {
	+    // Group the input into alternating "runs of content nodes" (arrays)
	+    // and individual tag nodes. This must walk the list we were given,
	+    // not $this->getChildren(), or callers which pass a filtered list
	+    // would get the unfiltered children back.
	+    $list = array();
	+    $content_block = array();
	+    foreach ($nodes as $node) {
	+      if ($node->isContentNode()) {
	+        $content_block[] = $node;
	+        continue;
	+      }
	+
	+      $list[] = $content_block;
	+      $content_block = array();
	+
	+      $list[] = $node;
	+    }
	+
	+    $list[] = $content_block;
	+
	+    $results = array();
	+    foreach ($list as $item) {
	+      // Tag nodes pass through unchanged.
	+      if (!is_array($item)) {
	+        $results[] = $item;
	+        continue;
	+      }
	+
	+      if (!$item) {
	+        continue;
	+      }
	+
	+      $parts = array();
	+      foreach ($item as $content_node) {
	+        $parts[] = $content_node->getRawString();
	+      }
	+      $parts = implode('', $parts);
	+
	+      if (!strlen($parts)) {
	+        continue;
	+      }
	+
	+      $results[] = id(new self())
	+        ->setContent($parts)
	+        ->setRawString($parts);
	+    }
	+
	+    return $results;
	+  }
	+
	+}
	diff --git a/src/parser/html/PhutilHTMLParser.php b/src/parser/html/PhutilHTMLParser.php
	new file mode 100644
	--- /dev/null
	+++ b/src/parser/html/PhutilHTMLParser.php
	@@ -0,0 +1,426 @@
	+<?php
	+
	+/**
	+ * Lenient parser which turns an HTML-like string into a tree of
	+ * @{class:PhutilDOMNode} objects.
	+ *
	+ * Anything which does not look like a well-formed tag is kept as content
	+ * rather than raising an error.
	+ */
	+final class PhutilHTMLParser extends Phobject {
	+
	+  // Node which newly parsed tags and content are appended to.
	+  private $cursor;
	+
	+  /**
	+   * Parse a string into a DOM tree.
	+   *
	+   * @param string Text to parse.
	+   * @return PhutilDOMNode Root node; parsed nodes are its descendants.
	+   */
	+  public function parseDocument($corpus) {
	+    // Divide the block into sequences of "tag" and "non-tag" content. Tag
	+    // content is anything between angle brackets ("<" and ">"). Non-tag
	+    // content is anything else.
	+
	+    $segment_pos = 0;
	+    $segments = array();
	+    $in_tag = false;
	+
	+    $corpus_length = strlen($corpus);
	+    for ($ii = 0; $ii < $corpus_length; $ii++) {
	+      $c = $corpus[$ii];
	+
	+      if ($in_tag && ($c === '>')) {
	+        $segments[] = array(
	+          'tag' => $in_tag,
	+          'pos' => $segment_pos,
	+          'end' => $ii + 1,
	+        );
	+
	+        $segment_pos = $ii + 1;
	+        $in_tag = false;
	+        continue;
	+      }
	+
	+      if (!$in_tag && ($c === '<')) {
	+        $segments[] = array(
	+          'tag' => $in_tag,
	+          'pos' => $segment_pos,
	+          'end' => $ii,
	+        );
	+
	+        $segment_pos = $ii;
	+        $in_tag = true;
	+        continue;
	+      }
	+    }
	+
	+    // Add whatever content was left at the end of the string. If we were in
	+    // a tag but did not find a closing ">", we treat this as normal content.
	+    $segments[] = array(
	+      'tag' => false,
	+      'pos' => $segment_pos,
	+      'end' => $corpus_length,
	+    );
	+
	+    // Slice the marked segments out of the raw corpus so we get a list of
	+    // "tag" strings and a list of "non-tag" strings.
	+
	+    $parts = array();
	+    foreach ($segments as $segment) {
	+      $tag = $segment['tag'];
	+      $pos = $segment['pos'];
	+      $len = $segment['end'] - $pos;
	+
	+      // If this is a tag, we'll drop the "<" at the beginning and the ">"
	+      // at the end here.
	+      if ($tag) {
	+        $slice_pos = $pos + 1;
	+        $slice_len = $len - 2;
	+      } else {
	+        $slice_pos = $pos;
	+        $slice_len = $len;
	+      }
	+
	+      if (($slice_pos < $corpus_length) && ($slice_len > 0)) {
	+        $content = substr($corpus, $slice_pos, $slice_len);
	+      } else {
	+        $content = '';
	+      }
	+
	+      $parts[] = array(
	+        'tag' => $tag,
	+        'pos' => $pos,
	+        'len' => $len,
	+        'content' => $content,
	+      );
	+    }
	+
	+    $root = new PhutilDOMNode();
	+    $this->setCursor($root);
	+
	+    foreach ($parts as $part) {
	+      // A non-null result means the part was consumed as an opening or
	+      // closing tag. Otherwise, fall through and record it as content.
	+      $tag = $this->newTagDOMNode($part);
	+      if ($tag !== null) {
	+        continue;
	+      }
	+
	+      $content = $part['content'];
	+
	+      // If this part is a tag, restore the angle brackets.
	+      if ($part['tag']) {
	+        $content = '<'.$content.'>';
	+      }
	+
	+      $node = id(new PhutilDOMNode())
	+        ->setContent($content)
	+        ->setRawString($content);
	+
	+      $this->getCursor()->appendChild($node);
	+    }
	+
	+    $root->mergeContent();
	+
	+    return $root;
	+  }
	+
	+  /**
	+   * Try to interpret a part as a tag and apply it to the tree.
	+   *
	+   * @param map<string, wild> Part with "tag" and "content" keys.
	+   * @return PhutilDOMNode|true|null The new node for an opening or
	+   *   self-closing tag, `true` for a closing tag which matched an open
	+   *   tag, or `null` if the part should be treated as content.
	+   */
	+  private function newTagDOMNode(array $part) {
	+    if (!$part['tag']) {
	+      return null;
	+    }
	+
	+    $raw_content = $part['content'];
	+    $content = $raw_content;
	+
	+    $content = trim($content);
	+    $content_len = strlen($content);
	+
	+    // If the tag content begins with "/", like "</td>", strip the slash
	+    // off and mark this as a closing tag.
	+    $is_close = false;
	+    if ($content_len > 0 && $content[0] === '/') {
	+      $is_close = true;
	+      $content = substr($content, 1);
	+      $content = trim($content);
	+      $content_len = strlen($content);
	+    }
	+
	+    // If the tag content ends with "/", like "<td />", strip the slash off
	+    // and mark this as self-closing.
	+    $self_close = false;
	+    if ($content_len > 0 && $content[$content_len - 1] === '/') {
	+      $self_close = true;
	+      $content = substr($content, 0, $content_len - 1);
	+      $content = trim($content);
	+      $content_len = strlen($content);
	+    }
	+
	+    // If this tag is both a closing tag and a self-closing tag, it is
	+    // not formatted correctly. Treat it as content.
	+    if ($self_close && $is_close) {
	+      return null;
	+    }
	+
	+    // Now, split the rest of the tag into the tag name and tag attributes.
	+    $pieces = preg_split('/\s+/', $content, 2);
	+    $tag_name = $pieces[0];
	+
	+    if (count($pieces) > 1) {
	+      $attributes = $pieces[1];
	+    } else {
	+      $attributes = '';
	+    }
	+
	+    // If there's no tag name, this tag is not valid. Treat it as content.
	+    if (!strlen($tag_name)) {
	+      return null;
	+    }
	+
	+    // If this is a closing tag with attributes, it's not valid. Treat it
	+    // as content.
	+    if ($is_close && strlen($attributes)) {
	+      return null;
	+    }
	+
	+    $tag_name = phutil_utf8_strtolower($tag_name);
	+
	+    // If we find a valid closing tag, try to find a matching tag on the stack.
	+    // If we find a matching tag, close it.
	+    // If we do not find a matching tag, treat the closing tag as content.
	+    if ($is_close) {
	+      $cursor = $this->getCursor();
	+
	+      while ($cursor) {
	+        if ($cursor->getTagName() === $tag_name) {
	+          $parent = $cursor->getParentNode();
	+          $this->setCursor($parent);
	+          return true;
	+        }
	+        $cursor = $cursor->getParentNode();
	+      }
	+
	+      return null;
	+    }
	+
	+    if (strlen($attributes)) {
	+      $attribute_map = $this->parseAttributes($attributes);
	+      // If the attributes can't be parsed, treat the tag as content.
	+      if ($attribute_map === null) {
	+        return null;
	+      }
	+    } else {
	+      $attribute_map = array();
	+    }
	+
	+    $node = id(new PhutilDOMNode())
	+      ->setTagName($tag_name)
	+      ->setAttributes($attribute_map)
	+      ->setRawString('<'.$raw_content.'>');
	+
	+    $cursor = $this->getCursor();
	+    $cursor->appendChild($node);
	+
	+    // Children of a self-closing tag belong to its parent, so only
	+    // descend into tags which remain open.
	+    if (!$self_close) {
	+      $this->setCursor($node);
	+    }
	+
	+    return $node;
	+  }
	+
	+  private function setCursor(PhutilDOMNode $cursor) {
	+    $this->cursor = $cursor;
	+    return $this;
	+  }
	+
	+  private function getCursor() {
	+    return $this->cursor;
	+  }
	+
	+  /**
	+   * Parse the attribute portion of a tag into a map.
	+   *
	+   * Attribute names are lowercased. Attributes with no value map to
	+   * `true`. Values may be unquoted or double-quoted.
	+   *
	+   * @param string Attribute text, like `a=1 b c="d"`.
	+   * @return map<string, wild>|null Attribute map, or `null` if the text
	+   *   is not valid and the tag should be treated as content.
	+   */
	+  private function parseAttributes($attributes) {
	+    $state = 'key';
	+
	+    $whitespace = array(
	+      ' ' => true,
	+      "\n" => true,
	+      "\t" => true,
	+      "\r" => true,
	+    );
	+
	+    $map = array();
	+    $len = strlen($attributes);
	+    $name_pos = null;
	+    $name_value = null;
	+    $value_pos = null;
	+    for ($ii = 0; $ii < $len; $ii++) {
	+      $c = $attributes[$ii];
	+      $is_space = isset($whitespace[$c]);
	+
	+      switch ($state) {
	+        case 'key':
	+          // We're looking for the start of an attribute name.
	+
	+          // Skip over any whitespace.
	+          if ($is_space) {
	+            break;
	+          }
	+
	+          // If we see "<tag =...", that isn't valid. Treat this tag as
	+          // content.
	+          if ($c === '=') {
	+            return null;
	+          }
	+
	+          // If we see a quotation mark with no attribute name, that isn't
	+          // valid. Treat this tag as content.
	+          if ($c === '"') {
	+            return null;
	+          }
	+
	+          // Any other character marks the beginning of an attribute name.
	+          // Switch the parser state to "name" to parse the name.
	+          $name_pos = $ii;
	+          $state = 'name';
	+          break;
	+        case 'name':
	+          // We're looking for the end of an attribute name.
	+
	+          // Finding a "=" or a space character ends the attribute name.
	+          // Save it, then figure out what to do with the parser state.
	+          if ($c === '=' || $is_space) {
	+            $name_value = substr($attributes, $name_pos, $ii - $name_pos);
	+            $name_value = phutil_utf8_strtolower($name_value);
	+
	+            // If this attribute already exists, the tag is invalid. This
	+            // means the input is something like "<tag a=1 a=2>".
	+            if (isset($map[$name_value])) {
	+              return null;
	+            }
	+          }
	+
	+          // If we find an "=", that's the end of the name. Next, we're going
	+          // to parse a value.
	+          if ($c === '=') {
	+            $state = 'value';
	+            break;
	+          }
	+
	+          // If we find whitespace, that's the end of the name. We're going
	+          // to look for an "=".
	+          if ($is_space) {
	+            $state = 'equals';
	+            break;
	+          }
	+
	+          break;
	+        case 'equals':
	+          // We've parsed the name of an attribute and are looking for an
	+          // "=" character.
	+
	+          // Skip over any whitespace.
	+          if ($is_space) {
	+            break;
	+          }
	+
	+          // This is the "=" we're looking for, so we're good to go.
	+          if ($c === '=') {
	+            $state = 'value';
	+            break;
	+          }
	+
	+          // If this is anything else, this is an attribute name with no
	+          // value. Treat it as "true" and move on. This corresponds to an
	+          // input like "<input disabled>".
	+          $map[$name_value] = true;
	+          $name_pos = $ii;
	+          $state = 'name';
	+          break;
	+        case 'value':
	+          // We've parsed an "=" and are looking for the start of a value.
	+
	+          // Skip over any whitespace.
	+          if ($is_space) {
	+            break;
	+          }
	+
	+          // Don't accept "<tag a==" to mean that key "a" has a value of
	+          // "=", since this is silly. To specify a value beginning with "=",
	+          // you have to quote it.
	+          if ($c === '=') {
	+            return null;
	+          }
	+
	+          // Anything else is a value.
	+          $value_pos = $ii;
	+
	+          // This is a quotation mark, so parse a quoted value.
	+          if ($c === '"') {
	+            $value_pos = $value_pos + 1;
	+            $state = 'quoted';
	+          } else {
	+            $state = 'unquoted';
	+          }
	+          break;
	+        case 'quoted':
	+          // We've started parsing a quoted value, so look for the closing
	+          // quote.
	+
	+          // We found the closing quote, so pull out the actual value.
	+          if ($c === '"') {
	+            $attr_value = substr($attributes, $value_pos, $ii - $value_pos);
	+
	+            $map[$name_value] = $attr_value;
	+            $state = 'key';
	+            break;
	+          }
	+
	+          // Anything else is more text in the quoted value.
	+          break;
	+        case 'unquoted':
	+          // We've started parsing an unquoted value, so look for terminating
	+          // whitespace.
	+
	+          // We've found some whitespace, so pull out the actual value.
	+          if ($is_space) {
	+            $attr_value = substr($attributes, $value_pos, $ii - $value_pos);
	+
	+            $map[$name_value] = $attr_value;
	+            $state = 'key';
	+            break;
	+          }
	+
	+          // Anything else is more text in the unquoted value.
	+          break;
	+      }
	+    }
	+
	+    // We ran out of input. Finish whatever the parser was in the middle of.
	+    switch ($state) {
	+      case 'key':
	+        // We were looking for the start of an attribute name, so there's
	+        // nothing to clean up.
	+        break;
	+      case 'name':
	+        // We were looking for the end of an attribute name. Treat whatever
	+        // we found as a name. Lowercase it, as we do for names which are
	+        // terminated by "=" or whitespace, so "<a B>" and "<a B=1>" agree.
	+        $name_value = substr($attributes, $name_pos, $len - $name_pos);
	+        $name_value = phutil_utf8_strtolower($name_value);
	+
	+        if (isset($map[$name_value])) {
	+          return null;
	+        }
	+
	+        $map[$name_value] = true;
	+        break;
	+      case 'equals':
	+      case 'value':
	+        // We found an attribute name followed by whitespace or an "=". Treat
	+        // whatever we found as a valid attribute name with no value.
	+
	+        if (isset($map[$name_value])) {
	+          return null;
	+        }
	+
	+        $map[$name_value] = true;
	+        break;
	+      case 'quoted':
	+      case 'unquoted':
	+        // We were parsing a value but ran out of characters before we found
	+        // the delimiter or closing quote. Treat whatever we found as a quoted
	+        // value. The length is measured from the start of the value, not
	+        // from the start of the name.
	+
	+        $attr_value = substr($attributes, $value_pos, $len - $value_pos);
	+
	+        $map[$name_value] = $attr_value;
	+        break;
	+    }
	+
	+    return $map;
	+  }
	+
	+}
	diff --git a/src/parser/html/__tests__/PhutilHTMLParserTestCase.php b/src/parser/html/__tests__/PhutilHTMLParserTestCase.php
	new file mode 100644
	--- /dev/null
	+++ b/src/parser/html/__tests__/PhutilHTMLParserTestCase.php
	@@ -0,0 +1,45 @@
	+<?php
	+
	+/**
	+ * Data-driven tests for @{class:PhutilHTMLParser}.
	+ *
	+ * Each file in the "data/" directory next to this test holds an input
	+ * document, a "~~~~~~~~~~" delimiter line, and the expected DOM tree as
	+ * JSON.
	+ */
	+final class PhutilHTMLParserTestCase
	+  extends PhutilTestCase {
	+
	+  public function testHTMLParser() {
	+    $root = dirname(__FILE__).'/data/';
	+    $tests = Filesystem::listDirectory($root, $include_hidden = false);
	+
	+    foreach ($tests as $test) {
	+      $path = $root.$test;
	+      $data = Filesystem::readFile($path);
	+
	+      // Everything before the delimiter is input; everything after it is
	+      // the expected result.
	+      $parts = explode("\n~~~~~~~~~~\n", $data);
	+      if (count($parts) !== 2) {
	+        throw new Exception(
	+          pht(
	+            'Expected "~~~~~~~~~~" delimiter in test "%s".',
	+            $test));
	+      }
	+
	+      $input = $parts[0];
	+      $expect = phutil_json_decode($parts[1]);
	+
	+      $document = id(new PhutilHTMLParser())
	+        ->parseDocument($input);
	+
	+      // We're just testing the child list of the root node since this
	+      // reduces the amount of boilerplate in the test cases.
	+      $list = array();
	+      foreach ($document->getChildren() as $child) {
	+        $list[] = $child->toDictionary();
	+      }
	+
	+      $this->assertEqual(
	+        $expect,
	+        $list,
	+        pht('DOM tree for "%s".', $test));
	+    }
	+  }
	+
	+}
	diff --git a/src/parser/html/__tests__/data/attributes-basic.txt b/src/parser/html/__tests__/data/attributes-basic.txt
	new file mode 100644
	--- /dev/null
	+++ b/src/parser/html/__tests__/data/attributes-basic.txt
	@@ -0,0 +1,13 @@
	+<a b=1 c d="e" />
	+~~~~~~~~~~
	+[
	+ {
	+ "tag": "a",
	+ "attributes": {
	+ "b": "1",
	+ "c": true,
	+ "d": "e"
	+ },
	+ "children": []
	+ }
	+]
	diff --git a/src/parser/html/__tests__/data/content-angle.txt b/src/parser/html/__tests__/data/content-angle.txt
	new file mode 100644
	--- /dev/null
	+++ b/src/parser/html/__tests__/data/content-angle.txt
	@@ -0,0 +1,7 @@
	+o< quack
	+~~~~~~~~~~
	+[
	+ {
	+ "content": "o< quack"
	+ }
	+]
	diff --git a/src/parser/html/__tests__/data/content-simple.txt b/src/parser/html/__tests__/data/content-simple.txt
	new file mode 100644
	--- /dev/null
	+++ b/src/parser/html/__tests__/data/content-simple.txt
	@@ -0,0 +1,7 @@
	+quack
	+~~~~~~~~~~
	+[
	+ {
	+ "content": "quack"
	+ }
	+]
	diff --git a/src/parser/html/__tests__/data/tag-mismatch.txt b/src/parser/html/__tests__/data/tag-mismatch.txt
	new file mode 100644
	--- /dev/null
	+++ b/src/parser/html/__tests__/data/tag-mismatch.txt
	@@ -0,0 +1,21 @@
	+<a><b><c></b></a>
	+~~~~~~~~~~
	+[
	+ {
	+ "tag": "a",
	+ "attributes": {},
	+ "children": [
	+ {
	+ "tag": "b",
	+ "attributes": {},
	+ "children": [
	+ {
	+ "tag": "c",
	+ "attributes": {},
	+ "children": []
	+ }
	+ ]
	+ }
	+ ]
	+ }
	+]
	diff --git a/src/parser/html/__tests__/data/tag-simple.txt b/src/parser/html/__tests__/data/tag-simple.txt
	new file mode 100644
	--- /dev/null
	+++ b/src/parser/html/__tests__/data/tag-simple.txt
	@@ -0,0 +1,135 @@
	+<a/>
	+<a/ >
	+< a/>
	+<a />
	+<a / >
	+< a />
	+< a / >
	+<a></a>
	+<a ></a >
	+< a>< /a>
	+< a >< /a >
	+<a></ a>
	+<a ></ a >
	+< a>< / a>
	+< a >< / a >
	+~~~~~~~~~~
	+[
	+ {
	+ "tag": "a",
	+ "attributes": {},
	+ "children": []
	+ },
	+ {
	+ "content": "\n"
	+ },
	+ {
	+ "tag": "a",
	+ "attributes": {},
	+ "children": []
	+ },
	+ {
	+ "content": "\n"
	+ },
	+ {
	+ "tag": "a",
	+ "attributes": {},
	+ "children": []
	+ },
	+ {
	+ "content": "\n"
	+ },
	+ {
	+ "tag": "a",
	+ "attributes": {},
	+ "children": []
	+ },
	+ {
	+ "content": "\n"
	+ },
	+ {
	+ "tag": "a",
	+ "attributes": {},
	+ "children": []
	+ },
	+ {
	+ "content": "\n"
	+ },
	+ {
	+ "tag": "a",
	+ "attributes": {},
	+ "children": []
	+ },
	+ {
	+ "content": "\n"
	+ },
	+ {
	+ "tag": "a",
	+ "attributes": {},
	+ "children": []
	+ },
	+ {
	+ "content": "\n"
	+ },
	+ {
	+ "tag": "a",
	+ "attributes": {},
	+ "children": []
	+ },
	+ {
	+ "content": "\n"
	+ },
	+ {
	+ "tag": "a",
	+ "attributes": {},
	+ "children": []
	+ },
	+ {
	+ "content": "\n"
	+ },
	+ {
	+ "tag": "a",
	+ "attributes": {},
	+ "children": []
	+ },
	+ {
	+ "content": "\n"
	+ },
	+ {
	+ "tag": "a",
	+ "attributes": {},
	+ "children": []
	+ },
	+ {
	+ "content": "\n"
	+ },
	+ {
	+ "tag": "a",
	+ "attributes": {},
	+ "children": []
	+ },
	+ {
	+ "content": "\n"
	+ },
	+ {
	+ "tag": "a",
	+ "attributes": {},
	+ "children": []
	+ },
	+ {
	+ "content": "\n"
	+ },
	+ {
	+ "tag": "a",
	+ "attributes": {},
	+ "children": []
	+ },
	+ {
	+ "content": "\n"
	+ },
	+ {
	+ "tag": "a",
	+ "attributes": {},
	+ "children": []
	+ }
	+]
	diff --git a/src/parser/html/__tests__/data/tag-table.txt b/src/parser/html/__tests__/data/tag-table.txt
	new file mode 100644
	--- /dev/null
	+++ b/src/parser/html/__tests__/data/tag-table.txt
	@@ -0,0 +1,39 @@
	+<table><tr><td>a</td><td>b</td><td /></tr></table>
	+~~~~~~~~~~
	+[
	+ {
	+ "tag": "table",
	+ "attributes": {},
	+ "children": [
	+ {
	+ "tag": "tr",
	+ "attributes": {},
	+ "children": [
	+ {
	+ "tag": "td",
	+ "attributes": {},
	+ "children": [
	+ {
	+ "content": "a"
	+ }
	+ ]
	+ },
	+ {
	+ "tag": "td",
	+ "attributes": {},
	+ "children": [
	+ {
	+ "content": "b"
	+ }
	+ ]
	+ },
	+ {
	+ "tag": "td",
	+ "attributes": {},
	+ "children": []
	+ }
	+ ]
	+ }
	+ ]
	+ }
	+]

File Metadata

Mime Type: text/plain
Expires: Mar 22 2025, 7:16 AM (4 w, 5 d ago)
Storage Engine: blob
Storage Format: Encrypted (AES-256-CBC)
Storage Handle: 7382398
Default Alt Text: D20568.id49067.diff (29 KB)

D20568.id49067.diffNo OneTemporaryActions

D20568.id49067.diffView Options

File Metadata

Event Timeline

D20568.id49067.diff
No OneTemporary
Actions

D20568.id49067.diff
View Options