diff --git a/src/parser/html/PhutilHTMLParser.php b/src/parser/html/PhutilHTMLParser.php
index a6e3d63..66e82a5 100644
--- a/src/parser/html/PhutilHTMLParser.php
+++ b/src/parser/html/PhutilHTMLParser.php
@@ -1,430 +1,434 @@
"). Non-tag
// content is anything else.
$segment_pos = 0;
$segments = array();
$in_tag = false;
for ($ii = 0; $ii < strlen($corpus); $ii++) {
$c = $corpus[$ii];
if ($in_tag && ($c === '>')) {
if ($segment_pos !== null) {
$segments[] = array(
'tag' => $in_tag,
'pos' => $segment_pos,
'end' => $ii + 1,
);
}
$segment_pos = $ii + 1;
$in_tag = false;
continue;
}
- if (!$in_tag && ($c === '<')) {
+ // When we encounter a "<", we start a new tag whether we're already in
+ // a tag or not. We want to parse "<math>1 < 2</math>" as a single "<math>"
+ // tag with the content "1 < 2".
+
+ if ($c === '<') {
$segments[] = array(
- 'tag' => $in_tag,
+ 'tag' => false,
'pos' => $segment_pos,
'end' => $ii,
);
$segment_pos = $ii;
$in_tag = true;
continue;
}
}
// Add whatever content was left at the end of the string. If we were in
// a tag but did not find a closing ">", we treat this as normal content.
$segments[] = array(
'tag' => false,
'pos' => $segment_pos,
'end' => $ii,
);
// Slice the marked segments out of the raw corpus so we get a list of
// "tag" strings and a list of "non-tag" strings.
$parts = array();
$corpus_length = strlen($corpus);
foreach ($segments as $segment) {
$tag = $segment['tag'];
$pos = $segment['pos'];
$len = $segment['end'] - $pos;
// If this is a tag, we'll drop the "<" at the beginning and the ">"
// at the end here.
if ($tag) {
$slice_pos = $pos + 1;
$slice_len = $len - 2;
} else {
$slice_pos = $pos;
$slice_len = $len;
}
if (($slice_pos < $corpus_length) && ($slice_len > 0)) {
$content = substr($corpus, $slice_pos, $slice_len);
} else {
$content = '';
}
$parts[] = array(
'tag' => $tag,
'pos' => $pos,
'len' => $len,
'content' => $content,
);
}
$root = new PhutilDOMNode();
$this->setCursor($root);
foreach ($parts as $part) {
$tag = $this->newTagDOMNode($part);
if ($tag !== null) {
continue;
}
$content = $part['content'];
// If this part is a tag, restore the angle brackets.
if ($part['tag']) {
$content = '<'.$content.'>';
}
$node = id(new PhutilDOMNode())
->setContent($content)
->setRawHead($content);
$this->getCursor()->appendChild($node);
}
$root->mergeContent();
return $root;
}
/**
 * Try to interpret one parsed segment as a tag and apply it to the DOM tree
 * that is being built around the current cursor.
 *
 * @param array $part Segment with "tag" and "content" keys, where "content"
 *   is the text found between the angle brackets.
 * @return PhutilDOMNode|true|null The newly appended node for an opening or
 *   self-closing tag, true when a closing tag matched an open ancestor, or
 *   null when the segment has to be treated as plain content.
 */
private function newTagDOMNode(array $part) {
  // Segments which were not marked as tags are always plain content.
  if (!$part['tag']) {
    return null;
  }

  // Keep the untouched text around so the raw head/tail can be rebuilt.
  $raw = $part['content'];
  $body = trim($raw);

  // A leading "/", like "</p>", marks a closing tag. Strip the slash.
  $closing = (strlen($body) > 0 && $body[0] === '/');
  if ($closing) {
    $body = trim(substr($body, 1));
  }

  // A trailing "/", like "<br />", marks a self-closing tag. Strip the slash.
  $body_len = strlen($body);
  $self_closing = ($body_len > 0 && $body[$body_len - 1] === '/');
  if ($self_closing) {
    $body = trim(substr($body, 0, $body_len - 1));
  }

  // Something which is both closing and self-closing is malformed, so it is
  // handled as content.
  if ($closing && $self_closing) {
    return null;
  }

  // Everything up to the first run of whitespace is the tag name; whatever
  // follows is the attribute string.
  $pieces = preg_split('/\s+/', $body, 2);
  $name = $pieces[0];
  $attributes = (count($pieces) > 1) ? $pieces[1] : '';

  // A tag without a name is not a tag.
  if (strlen($name) === 0) {
    return null;
  }

  // Closing tags may not carry attributes.
  if ($closing && strlen($attributes) > 0) {
    return null;
  }

  $name = phutil_utf8_strtolower($name);

  if ($closing) {
    // Walk from the cursor toward the root looking for the open tag which
    // this closing tag belongs to. If none is found, the closing tag is
    // treated as content.
    for ($node = $this->getCursor(); $node; $node = $node->getParentNode()) {
      if ($node->getTagName() !== $name) {
        continue;
      }

      // Record the raw closing text on the tag being closed, then move the
      // cursor back up to that tag's parent.
      $node->setRawTail('<'.$raw.'>');
      $this->setCursor($node->getParentNode());
      return true;
    }

    return null;
  }

  $attribute_map = array();
  if (strlen($attributes)) {
    $attribute_map = $this->parseAttributes($attributes);

    // Attributes which cannot be parsed turn the whole tag into content.
    if ($attribute_map === null) {
      return null;
    }
  }

  $node = id(new PhutilDOMNode())
    ->setTagName($name)
    ->setAttributes($attribute_map)
    ->setRawHead('<'.$raw.'>');

  $this->getCursor()->appendChild($node);

  // Self-closing tags cannot have children, so the cursor stays put.
  if ($self_closing) {
    return $node;
  }

  // Otherwise, subsequent content is nested inside the new tag.
  $this->setCursor($node);
  return $node;
}
/**
 * Store the node which subsequently parsed content is appended to.
 *
 * @param PhutilDOMNode $cursor Node to use as the new cursor.
 * @return $this
 */
private function setCursor(PhutilDOMNode $cursor) {
  $this->cursor = $cursor;

  return $this;
}
/**
 * Return the node most recently stored with setCursor().
 */
private function getCursor() {
  return $this->cursor;
}
private function parseAttributes($attributes) {
$state = 'key';
$whitespace = array(
' ' => true,
"\n" => true,
"\t" => true,
"\r" => true,
);
$map = array();
$len = strlen($attributes);
$key_pos = null;
for ($ii = 0; $ii < $len; $ii++) {
$c = $attributes[$ii];
$is_space = isset($whitespace[$c]);
switch ($state) {
case 'key':
// We're looking for the start of an attribute name.
// Skip over any whitespace.
if ($is_space) {
break;
}
// If we see "".
if (isset($map[$name_value])) {
return null;
}
}
// If we find an "=", that's the end of the name. Next, we're going
// to parse a value.
if ($c === '=') {
$state = 'value';
break;
}
// If we find whitespace, that's the end of the name. We're going
// to look for an "=".
if ($is_space) {
$state = 'equals';
break;
}
break;
case 'equals':
// We've parsed the name of an attribute and are looking for an
// "=" character.
// Skip over any whitespace.
if ($is_space) {
break;
}
// This is the "=" we're looking for, so we're good to go.
if ($c === '=') {
$state = 'value';
break;
}
// If this is anything else, this is an attribute name with no
// value. Treat it as "true" and move on. This corresponds to an
// input like "<input disabled>".
$map[$name_value] = true;
$name_pos = $ii;
$state = 'name';
break;
case 'value':
// We've parsed an "=" and are looking for the start of a value.
// Skip over any whitespace.
if ($is_space) {
break;
}
// Don't accept "1 < 2
+~~~~~~~~~~
+[
+ {
+ "tag": "math",
+ "attributes": {},
+ "children": [
+ {
+ "content": "1 < 2"
+ }
+ ]
+ }
+]