Changeset View
Changeset View
Standalone View
Standalone View
src/parser/html/PhutilHTMLParser.php
- This file was added.
<?php
/**
 * Lenient parser which converts a string of HTML-like markup into a tree of
 * PhutilDOMNode objects.
 *
 * The parser does not reject malformed input. Anything between angle brackets
 * which can not be interpreted as a tag (no tag name, unparseable attributes,
 * an unmatched closing tag, and so on) is kept in the tree as content.
 */
final class PhutilHTMLParser extends Phobject {

  /**
   * Node which newly parsed tags and content are appended to. Opening a tag
   * moves the cursor into the new node; closing a tag moves it back out.
   */
  private $cursor;

  /**
   * Parse a document into a tree of DOM nodes.
   *
   * @param string Raw markup to parse.
   * @return PhutilDOMNode Root node of the parsed tree. The root itself has
   *   no tag name; parsed tags and content are appended beneath it.
   */
  public function parseDocument($corpus) {
    // The corpus is never modified, so measure it once up front.
    $corpus_length = strlen($corpus);

    // Divide the block into sequences of "tag" and "non-tag" content. Tag
    // content is anything between angle brackets ("<" and ">"). Non-tag
    // content is anything else.

    $segment_pos = 0;
    $segments = array();
    $in_tag = false;

    for ($ii = 0; $ii < $corpus_length; $ii++) {
      $c = $corpus[$ii];

      if ($in_tag && ($c === '>')) {
        // End of a tag: the segment includes the closing ">".
        $segments[] = array(
          'tag' => true,
          'pos' => $segment_pos,
          'end' => $ii + 1,
        );

        $segment_pos = $ii + 1;
        $in_tag = false;
        continue;
      }

      if (!$in_tag && ($c === '<')) {
        // Start of a tag: flush the non-tag content which came before it.
        // This segment may be empty, for example with adjacent tags.
        $segments[] = array(
          'tag' => false,
          'pos' => $segment_pos,
          'end' => $ii,
        );

        $segment_pos = $ii;
        $in_tag = true;
        continue;
      }
    }

    // Add whatever content was left at the end of the string. If we were in
    // a tag but did not find a closing ">", we treat this as normal content.
    $segments[] = array(
      'tag' => false,
      'pos' => $segment_pos,
      'end' => $corpus_length,
    );

    // Slice the marked segments out of the raw corpus so we get a list of
    // "tag" strings and a list of "non-tag" strings.

    $parts = array();
    foreach ($segments as $segment) {
      $tag = $segment['tag'];
      $pos = $segment['pos'];
      $len = $segment['end'] - $pos;

      // If this is a tag, we'll drop the "<" at the beginning and the ">"
      // at the end here.
      if ($tag) {
        $slice_pos = $pos + 1;
        $slice_len = $len - 2;
      } else {
        $slice_pos = $pos;
        $slice_len = $len;
      }

      // Guard the slice so empty segments (and "<>") produce an empty string
      // instead of asking substr() for an out-of-range or empty range.
      if (($slice_pos < $corpus_length) && ($slice_len > 0)) {
        $content = substr($corpus, $slice_pos, $slice_len);
      } else {
        $content = '';
      }

      $parts[] = array(
        'tag' => $tag,
        'pos' => $pos,
        'len' => $len,
        'content' => $content,
      );
    }

    $root = new PhutilDOMNode();
    $this->setCursor($root);

    foreach ($parts as $part) {
      // A non-null result means the part was consumed as an opening,
      // self-closing, or closing tag and the tree has already been updated.
      $tag = $this->newTagDOMNode($part);

      if ($tag !== null) {
        continue;
      }

      $content = $part['content'];

      // If this part is a tag, restore the angle brackets.
      if ($part['tag']) {
        $content = '<'.$content.'>';
      }

      $node = id(new PhutilDOMNode())
        ->setContent($content)
        ->setRawString($content);

      $this->getCursor()->appendChild($node);
    }

    $root->mergeContent();

    return $root;
  }

  /**
   * Try to interpret one part of the document as a tag and apply it to the
   * tree at the current cursor.
   *
   * @param map<string, wild> Part with a boolean "tag" key and a "content"
   *   key holding the text between the angle brackets.
   * @return PhutilDOMNode|true|null The new node for an opening or
   *   self-closing tag, `true` for a closing tag which matched an open
   *   element, or `null` if the part must be treated as plain content.
   */
  private function newTagDOMNode(array $part) {
    if (!$part['tag']) {
      return null;
    }

    $raw_content = $part['content'];

    $content = $raw_content;
    $content = trim($content);
    $content_len = strlen($content);

    // If the tag content begins with "/", like "</td>", strip the slash
    // off and mark this as a closing tag.
    $is_close = false;
    if ($content_len > 0 && $content[0] === '/') {
      $is_close = true;
      $content = substr($content, 1);
      $content = trim($content);
      $content_len = strlen($content);
    }

    // If the tag content ends with "/", like "<td />", strip the slash off
    // and mark this as self-closing.
    $self_close = false;
    if ($content_len > 0 && $content[$content_len - 1] === '/') {
      $self_close = true;
      $content = substr($content, 0, $content_len - 1);
      $content = trim($content);
      $content_len = strlen($content);
    }

    // If this tag is both a closing tag and a self-closing tag, it is
    // not formatted correctly. Treat it as content.
    if ($self_close && $is_close) {
      return null;
    }

    // Now, split the rest of the tag into the tag name and tag attributes.
    $pieces = preg_split('/\s+/', $content, 2);
    $tag_name = $pieces[0];

    if (count($pieces) > 1) {
      $attributes = $pieces[1];
    } else {
      $attributes = '';
    }

    // If there's no tag name, this tag is not valid. Treat it as content.
    if (!strlen($tag_name)) {
      return null;
    }

    // If this is a closing tag with attributes, it's not valid. Treat it
    // as content.
    if ($is_close && strlen($attributes)) {
      return null;
    }

    $tag_name = phutil_utf8_strtolower($tag_name);

    // If we find a valid closing tag, try to find a matching tag on the stack.
    // If we find a matching tag, close it.
    // If we do not find a matching tag, treat the closing tag as content.
    if ($is_close) {
      $cursor = $this->getCursor();
      while ($cursor) {
        if ($cursor->getTagName() === $tag_name) {
          // Moving the cursor to the parent of the match also leaves any
          // elements which were opened inside the match and never closed.
          $parent = $cursor->getParentNode();
          $this->setCursor($parent);
          return true;
        }
        $cursor = $cursor->getParentNode();
      }

      return null;
    }

    if (strlen($attributes)) {
      $attribute_map = $this->parseAttributes($attributes);

      // If the attributes can't be parsed, treat the tag as content.
      if ($attribute_map === null) {
        return null;
      }
    } else {
      $attribute_map = array();
    }

    $node = id(new PhutilDOMNode())
      ->setTagName($tag_name)
      ->setAttributes($attribute_map)
      ->setRawString('<'.$raw_content.'>');

    $cursor = $this->getCursor();
    $cursor->appendChild($node);

    // Only descend into tags which can have children. A self-closing tag is
    // appended but the cursor stays where it was.
    if (!$self_close) {
      $this->setCursor($node);
    }

    return $node;
  }

  /**
   * Set the node which subsequent tags and content are appended to.
   *
   * @param PhutilDOMNode New cursor node.
   * @return this
   */
  private function setCursor(PhutilDOMNode $cursor) {
    $this->cursor = $cursor;
    return $this;
  }

  /**
   * Get the node which tags and content are currently being appended to.
   *
   * @return PhutilDOMNode Current cursor node.
   */
  private function getCursor() {
    return $this->cursor;
  }

  /**
   * Parse the attribute portion of a tag (everything after the tag name)
   * with a small character-by-character state machine.
   *
   * Attribute names are lowercased. Values may be unquoted (terminated by
   * whitespace) or wrapped in double quotes; only `"` is recognized as a
   * quote character. Attributes without a value map to `true`.
   *
   * @param string Raw attribute text, like `a=1 b="two" c`.
   * @return map<string, string|true>|null Map of attribute names to values,
   *   or `null` if the text is not a valid attribute list (for example, a
   *   repeated attribute name, or a stray "=" or quote).
   */
  private function parseAttributes($attributes) {
    $state = 'key';

    $whitespace = array(
      ' ' => true,
      "\n" => true,
      "\t" => true,
      "\r" => true,
    );

    $map = array();
    $len = strlen($attributes);

    // The state machine always assigns these before it reads them. They are
    // initialized here so that invariant does not have to be proven by every
    // reader.
    $name_pos = 0;
    $name_value = '';
    $value_pos = 0;

    for ($ii = 0; $ii < $len; $ii++) {
      $c = $attributes[$ii];
      $is_space = isset($whitespace[$c]);

      switch ($state) {
        case 'key':
          // We're looking for the start of an attribute name.

          // Skip over any whitespace.
          if ($is_space) {
            break;
          }

          // If we see "<tag =...", that isn't valid. Treat this tag as
          // content.
          if ($c === '=') {
            return null;
          }

          // If we see a quotation mark with no attribute name, that isn't
          // valid. Treat this tag as content.
          if ($c === '"') {
            return null;
          }

          // Any other character marks the beginning of an attribute name.
          // Switch the parser state to "name" to parse the name.
          $name_pos = $ii;
          $state = 'name';
          break;
        case 'name':
          // We're looking for the end of an attribute name.

          // Finding a "=" or a space character ends the attribute name.
          // Save it, then figure out what to do with the parser state.
          if ($c === '=' || $is_space) {
            $name_value = substr($attributes, $name_pos, $ii - $name_pos);
            $name_value = phutil_utf8_strtolower($name_value);

            // If this attribute already exists, the tag is invalid. This means
            // the input is something like "<tag a=1 a=2>".
            if (isset($map[$name_value])) {
              return null;
            }
          }

          // If we find an "=", that's the end of the name. Next, we're going
          // to parse a value.
          if ($c === '=') {
            $state = 'value';
            break;
          }

          // If we find whitespace, that's the end of the name. We're going
          // to look for an "=".
          if ($is_space) {
            $state = 'equals';
            break;
          }

          break;
        case 'equals':
          // We've parsed the name of an attribute and are looking for an
          // "=" character.

          // Skip over any whitespace.
          if ($is_space) {
            break;
          }

          // This is the "=" we're looking for, so we're good to go.
          if ($c === '=') {
            $state = 'value';
            break;
          }

          // If this is anything else, this is an attribute name with no
          // value. Treat it as "true" and move on. This corresponds to an
          // input like "<input disabled>".
          $map[$name_value] = true;
          $name_pos = $ii;
          $state = 'name';
          break;
        case 'value':
          // We've parsed an "=" and are looking for the start of a value.

          // Skip over any whitespace.
          if ($is_space) {
            break;
          }

          // Don't accept "<tag a==" to mean that key "a" has a value of
          // "=", since this is silly. To specify a value beginning with "=",
          // you have to quote it.
          if ($c === '=') {
            return null;
          }

          // Anything else is a value.
          $value_pos = $ii;

          // This is a quotation mark, so parse a quoted value. The value
          // itself starts after the opening quote.
          if ($c === '"') {
            $value_pos = $value_pos + 1;
            $state = 'quoted';
          } else {
            $state = 'unquoted';
          }
          break;
        case 'quoted':
          // We've started parsing a quoted value, so look for the closing
          // quote.

          // We found the closing quote, so pull out the actual value.
          if ($c === '"') {
            $attr_value = substr($attributes, $value_pos, $ii - $value_pos);
            $map[$name_value] = $attr_value;
            $state = 'key';
            break;
          }

          // Anything else is more text in the quoted value.
          break;
        case 'unquoted':
          // We've started parsing an unquoted value, so look for terminating
          // whitespace.

          // We've found some whitespace, so pull out the actual value.
          if ($is_space) {
            $attr_value = substr($attributes, $value_pos, $ii - $value_pos);
            $map[$name_value] = $attr_value;
            $state = 'key';
            break;
          }

          // Anything else is more text in the unquoted value.
          break;
      }
    }

    // We ran out of input. Finish whatever the parser was in the middle of.
    switch ($state) {
      case 'key':
        // We were looking for the start of an attribute name, so there's
        // nothing to clean up.
        break;
      case 'name':
        // We were looking for the end of an attribute name. Treat whatever
        // we found as a name.
        $name_value = substr($attributes, $name_pos, $len - $name_pos);

        // Normalize the name the same way the main loop does, so a trailing
        // attribute like "<input DISABLED>" gets a lowercase key and is
        // compared against earlier attributes case-insensitively.
        $name_value = phutil_utf8_strtolower($name_value);

        if (isset($map[$name_value])) {
          return null;
        }
        $map[$name_value] = true;
        break;
      case 'equals':
      case 'value':
        // We found an attribute name followed by whitespace or an "=". Treat
        // whatever we found as a valid attribute name with no value.
        if (isset($map[$name_value])) {
          return null;
        }
        $map[$name_value] = true;
        break;
      case 'quoted':
      case 'unquoted':
        // We were parsing a value but ran out of characters before we found
        // the delimiter or closing quote. Treat whatever we found as a quoted
        // value.
        //
        // The remaining length is measured from the start of the value. The
        // cast covers PHP versions before 7.0, where substr() returns false
        // instead of an empty string when the offset equals the string
        // length (an input like `a="`).
        $attr_value = (string)substr(
          $attributes,
          $value_pos,
          $len - $value_pos);
        $map[$name_value] = $attr_value;
        break;
    }

    return $map;
  }

}