diff --git a/src/parser/html/PhutilHTMLParser.php b/src/parser/html/PhutilHTMLParser.php index a6e3d63..66e82a5 100644 --- a/src/parser/html/PhutilHTMLParser.php +++ b/src/parser/html/PhutilHTMLParser.php @@ -1,430 +1,434 @@ "). Non-tag // content is anything else. $segment_pos = 0; $segments = array(); $in_tag = false; for ($ii = 0; $ii < strlen($corpus); $ii++) { $c = $corpus[$ii]; if ($in_tag && ($c === '>')) { if ($segment_pos !== null) { $segments[] = array( 'tag' => $in_tag, 'pos' => $segment_pos, 'end' => $ii + 1, ); } $segment_pos = $ii + 1; $in_tag = false; continue; } - if (!$in_tag && ($c === '<')) { + // When we encounter a "<", we start a new tag whether we're already in + // a tag or not. We want to parse "1 < 2" as a single tag with + // the content "1 < 2". + + if ($c === '<') { $segments[] = array( - 'tag' => $in_tag, + 'tag' => false, 'pos' => $segment_pos, 'end' => $ii, ); $segment_pos = $ii; $in_tag = true; continue; } } // Add whatever content was left at the end of the string. If we were in // a tag but did not find a closing ">", we treat this as normal content. $segments[] = array( 'tag' => false, 'pos' => $segment_pos, 'end' => $ii, ); // Slice the marked segments out of the raw corpus so we get a list of // "tag" strings and a list of "non-tag" strings. $parts = array(); $corpus_length = strlen($corpus); foreach ($segments as $segment) { $tag = $segment['tag']; $pos = $segment['pos']; $len = $segment['end'] - $pos; // If this is a tag, we'll drop the "<" at the beginning and the ">" // at the end here. 
if ($tag) {
        $slice_pos = $pos + 1;
        $slice_len = $len - 2;
      } else {
        $slice_pos = $pos;
        $slice_len = $len;
      }

      if (($slice_pos < $corpus_length) && ($slice_len > 0)) {
        $content = substr($corpus, $slice_pos, $slice_len);
      } else {
        $content = '';
      }

      $parts[] = array(
        'tag' => $tag,
        'pos' => $pos,
        'len' => $len,
        'content' => $content,
      );
    }

    $root = new PhutilDOMNode();
    $this->setCursor($root);

    foreach ($parts as $part) {
      $tag = $this->newTagDOMNode($part);

      if ($tag !== null) {
        continue;
      }

      $content = $part['content'];

      // If this part is a tag, restore the angle brackets.
      if ($part['tag']) {
        $content = '<'.$content.'>';
      }

      $node = id(new PhutilDOMNode())
        ->setContent($content)
        ->setRawHead($content);

      $this->getCursor()->appendChild($node);
    }

    $root->mergeContent();

    return $root;
  }

  /**
   * Try to interpret one sliced segment as a tag and apply it to the tree
   * at the current cursor.
   *
   * An opening tag is appended under the cursor and becomes the new cursor.
   * A self-closing tag is appended but the cursor does not move. A closing
   * tag closes the nearest open node with the same name (the cursor itself
   * or one of its ancestors) and moves the cursor to that node's parent.
   *
   * @param array $part Segment map; the "tag" and "content" keys are read.
   * @return PhutilDOMNode|true|null The appended node for an opening or
   *   self-closing tag, true when a closing tag matched an open node, or
   *   null when the segment is not a usable tag and the caller should keep
   *   it as plain content.
   */
  private function newTagDOMNode(array $part) {
    // Segments that were not delimited by "<" and ">" are never tags.
    if (!$part['tag']) {
      return null;
    }

    // Keep the untouched text so the raw head/tail can be reproduced
    // exactly; all inspection happens on a trimmed working copy.
    $raw = $part['content'];
    $body = trim($raw);

    // A leading "/", as in "</tag>", marks a closing tag. Drop the slash
    // and any whitespace which followed it.
    $closing = (strlen($body) > 0 && $body[0] === '/');
    if ($closing) {
      $body = trim(substr($body, 1));
    }

    // A trailing "/", as in "<tag />", marks a self-closing tag. Drop the
    // slash and any whitespace which preceded it.
    $body_len = strlen($body);
    $self_closing = ($body_len > 0 && $body[$body_len - 1] === '/');
    if ($self_closing) {
      $body = trim(substr($body, 0, $body_len - 1));
    }

    // Something shaped like "</tag/>" is malformed: it can not be both a
    // closing tag and a self-closing tag. Leave it as content.
    if ($closing && $self_closing) {
      return null;
    }

    // Whatever precedes the first run of whitespace is the tag name and
    // whatever follows it is the unparsed attribute text.
    $pieces = preg_split('/\s+/', $body, 2);
    $name = $pieces[0];
    $attr_text = (count($pieces) > 1) ? $pieces[1] : '';

    // A tag with no name is not a tag.
    if (!strlen($name)) {
      return null;
    }

    // Closing tags may not carry attributes.
    if ($closing && strlen($attr_text)) {
      return null;
    }

    $name = phutil_utf8_strtolower($name);

    if ($closing) {
      // Search from the cursor up through its ancestors for an open node
      // with this name. If none exists, the stray closing tag is content.
      for ($open = $this->getCursor(); $open; $open = $open->getParentNode()) {
        if ($open->getTagName() !== $name) {
          continue;
        }

        // Record the closing tag's raw text on the node it closes, then
        // resume appending to that node's parent.
        $open->setRawTail('<'.$raw.'>');
        $this->setCursor($open->getParentNode());

        return true;
      }

      return null;
    }

    $attr_map = array();
    if (strlen($attr_text)) {
      $attr_map = $this->parseAttributes($attr_text);

      // Attributes which fail to parse demote the whole tag to content.
      if ($attr_map === null) {
        return null;
      }
    }

    $element = id(new PhutilDOMNode())
      ->setTagName($name)
      ->setAttributes($attr_map)
      ->setRawHead('<'.$raw.'>');

    $this->getCursor()->appendChild($element);

    // Only tags which stay open receive the content which follows them.
    if (!$self_closing) {
      $this->setCursor($element);
    }

    return $element;
  }

  /**
   * Select the node which subsequently parsed nodes are appended to.
   *
   * @param PhutilDOMNode $cursor Node to make current.
   * @return $this
   */
  private function setCursor(PhutilDOMNode $cursor) {
    $this->cursor = $cursor;
    return $this;
  }

  /**
   * Get the node which parsed nodes are currently being appended to.
   *
   * @return PhutilDOMNode The node last passed to setCursor().
   */
  private function getCursor() {
    return $this->cursor;
  }

  private function parseAttributes($attributes) {
    $state = 'key';

    $whitespace = array(
      ' ' => true,
      "\n" => true,
      "\t" => true,
      "\r" => true,
    );

    $map = array();
    $len = strlen($attributes);
    $key_pos = null;
    for ($ii = 0; $ii < $len; $ii++) {
      $c = $attributes[$ii];
      $is_space = isset($whitespace[$c]);

      switch ($state) {
        case 'key':
          // We're looking for the start of an attribute name.
// Skip over any whitespace. if ($is_space) { break; } // If we see "". if (isset($map[$name_value])) { return null; } } // If we find an "=", that's the end of the name. Next, we're going // to parse a value. if ($c === '=') { $state = 'value'; break; } // If we find whitespace, that's the end of the name. We're going // to look for an "=". if ($is_space) { $state = 'equals'; break; } break; case 'equals': // We've parsed the name of an attribute and are looking for an // "=" character. // Skip over any whitespace. if ($is_space) { break; } // This is the "=" we're looking for, so we're good to go. if ($c === '=') { $state = 'value'; break; } // If this is anything else, this is an attribute name with no // value. Treat it as "true" and move on. This corresponds to an // input like "". $map[$name_value] = true; $name_pos = $ii; $state = 'name'; break; case 'value': // We've parsed an "=" and are looking for the start of a value. // Skip over any whitespace. if ($is_space) { break; } // Don't accept "