diff --git a/src/parser/html/PhutilDOMNode.php b/src/parser/html/PhutilDOMNode.php index 5b87ae3..84781c7 100644 --- a/src/parser/html/PhutilDOMNode.php +++ b/src/parser/html/PhutilDOMNode.php @@ -1,193 +1,208 @@ content = $content; return $this; } public function getContent() { return $this->content; } public function isContentNode() { return ($this->content !== null); } public function setTagName($tag_name) { $this->tagName = $tag_name; return $this; } public function getTagName() { return $this->tagName; } public function appendChild(PhutilDOMNode $node) { $node->parentNode = $this; $this->children[] = $node; return $this; } public function getChildren() { return $this->children; } public function getParentNode() { return $this->parentNode; } public function setAttributes(array $attributes) { $this->attributes = $attributes; return $this; } public function getAttributes() { return $this->attributes; } - public function setRawString($raw_string) { - $this->rawString = $raw_string; + public function setRawHead($raw_string) { + $this->rawHead = $raw_string; + return $this; + } + + public function setRawTail($raw_tail) { + $this->rawTail = $raw_tail; return $this; } public function getRawString() { - return $this->rawString; + $raw = array(); + $raw[] = $this->rawHead; + + foreach ($this->getChildren() as $child) { + $raw[] = $child->getRawString(); + } + + $raw[] = $this->rawTail; + + return implode('', $raw); } public function toDictionary() { if ($this->isContentNode()) { return array( 'content' => $this->content, ); } else { $children = array(); foreach ($this->getChildren() as $child) { $children[] = $child->toDictionary(); } return array( 'tag' => $this->getTagName(), 'attributes' => $this->getAttributes(), 'children' => $children, ); } } /** * Get a list of the children of a given DOM node, treating unexpected * tags as if they were raw content. */ public function selectChildrenWithTags(array $tag_list) { $tag_map = array_fuse($tag_list); $nodes = array(); foreach ($this->getChildren() as $child) { // If this is already a content node, just keep it as-is. if ($child->isContentNode()) { $nodes[] = $child; continue; } $tag_name = $child->getTagName(); // If this is a tag that we're allowing, keep it as-is. if (isset($tag_map[$tag_name])) { $nodes[] = $child; continue; } // Otherwise, this is some other tag. Convert it into a content // node. - $raw_content = $child->getRawString(); + $raw_string = $child->getRawString(); $nodes[] = id(new self()) - ->setContent($raw_content) - ->setRawContent($raw_content); + ->setContent($raw_string) + ->setRawHead($raw_string); } return $this->mergeContentNodes($nodes); } public function getRawContentString() { $content_node = $this->selectChildrenWithTags(array()); if (!$content_node) { return ''; } return head($content_node)->getRawString(); } public function mergeContent() { $this->children = $this->mergeContentNodes($this->children); foreach ($this->getChildren() as $child) { $child->parentNode = $this; $child->mergeContent(); } return $this; } /** * Given a list of nodes, combine sequences of multiple adjacent content * nodes into single nodes. */ private function mergeContentNodes(array $nodes) { $list = array(); $content_block = array(); - foreach ($this->getChildren() as $child) { - if ($child->isContentNode()) { - $content_block[] = $child; + foreach ($nodes as $node) { + if ($node->isContentNode()) { + $content_block[] = $node; continue; } $list[] = $content_block; $content_block = array(); - $list[] = $child; + $list[] = $node; } $list[] = $content_block; $results = array(); foreach ($list as $item) { if (!is_array($item)) { $results[] = $item; continue; } if (!$item) { continue; } $parts = array(); foreach ($item as $content_node) { $parts[] = $content_node->getRawString(); } $parts = implode('', $parts); if (!strlen($parts)) { continue; } $results[] = id(new self()) ->setContent($parts) - ->setRawString($parts); + ->setRawHead($parts); } return $results; } } diff --git a/src/parser/html/PhutilHTMLParser.php b/src/parser/html/PhutilHTMLParser.php index 4b4841d..a6e3d63 100644 --- a/src/parser/html/PhutilHTMLParser.php +++ b/src/parser/html/PhutilHTMLParser.php @@ -1,426 +1,430 @@ "). Non-tag // content is anything else. $segment_pos = 0; $segments = array(); $in_tag = false; for ($ii = 0; $ii < strlen($corpus); $ii++) { $c = $corpus[$ii]; if ($in_tag && ($c === '>')) { if ($segment_pos !== null) { $segments[] = array( 'tag' => $in_tag, 'pos' => $segment_pos, 'end' => $ii + 1, ); } $segment_pos = $ii + 1; $in_tag = false; continue; } if (!$in_tag && ($c === '<')) { $segments[] = array( 'tag' => $in_tag, 'pos' => $segment_pos, 'end' => $ii, ); $segment_pos = $ii; $in_tag = true; continue; } } // Add whatever content was left at the end of the string. If we were in // a tag but did not find a closing ">", we treat this as normal content. $segments[] = array( 'tag' => false, 'pos' => $segment_pos, 'end' => $ii, ); // Slice the marked segments out of the raw corpus so we get a list of // "tag" strings and a list of "non-tag" strings. $parts = array(); $corpus_length = strlen($corpus); foreach ($segments as $segment) { $tag = $segment['tag']; $pos = $segment['pos']; $len = $segment['end'] - $pos; // If this is a tag, we'll drop the "<" at the beginning and the ">" // at the end here. if ($tag) { $slice_pos = $pos + 1; $slice_len = $len - 2; } else { $slice_pos = $pos; $slice_len = $len; } if (($slice_pos < $corpus_length) && ($slice_len > 0)) { $content = substr($corpus, $slice_pos, $slice_len); } else { $content = ''; } $parts[] = array( 'tag' => $tag, 'pos' => $pos, 'len' => $len, 'content' => $content, ); } $root = new PhutilDOMNode(); $this->setCursor($root); foreach ($parts as $part) { $tag = $this->newTagDOMNode($part); if ($tag !== null) { continue; } $content = $part['content']; // If this part is a tag, restore the angle brackets. if ($part['tag']) { $content = '<'.$content.'>'; } $node = id(new PhutilDOMNode()) ->setContent($content) - ->setRawString($content); + ->setRawHead($content); $this->getCursor()->appendChild($node); } $root->mergeContent(); return $root; } private function newTagDOMNode(array $part) { if (!$part['tag']) { return null; } $raw_content = $part['content']; $content = $raw_content; $content = trim($content); $content_len = strlen($content); // If the tag content begins with "/", like "", strip the slash // off and mark this as a closing tag. $is_close = false; if ($content_len > 0 && $content[0] === '/') { $is_close = true; $content = substr($content, 1); $content = trim($content); $content_len = strlen($content); } // If the tag content ends with "/", like "", strip the slash off // and mark this as self-closing. $self_close = false; if ($content_len > 0 && $content[$content_len - 1] === '/') { $self_close = true; $content = substr($content, 0, $content_len - 1); $content = trim($content); $content_len = strlen($content); } // If this tag is both a closing tag and a self-closing tag, it is // not formatted correctly. Treat it as content. if ($self_close && $is_close) { return null; } // Now, split the rest of the tag into the tag name and tag attributes. $pieces = preg_split('/\s+/', $content, 2); $tag_name = $pieces[0]; if (count($pieces) > 1) { $attributes = $pieces[1]; } else { $attributes = ''; } // If there's no tag name, this tag is not valid. Treat it as content. if (!strlen($tag_name)) { return null; } // If this is a closing tag with attributes, it's not valid. Treat it // as content. if ($is_close && strlen($attributes)) { return null; } $tag_name = phutil_utf8_strtolower($tag_name); // If we find a valid closing tag, try to find a matching tag on the stack. // If we find a matching tag, close it. // If we do not find a matching tag, treat the closing tag as content. if ($is_close) { $cursor = $this->getCursor(); while ($cursor) { if ($cursor->getTagName() === $tag_name) { + // Add this raw content to the raw content of the tag we're closing. + $cursor->setRawTail('<'.$raw_content.'>'); + $parent = $cursor->getParentNode(); $this->setCursor($parent); + return true; } $cursor = $cursor->getParentNode(); } return null; } if (strlen($attributes)) { $attribute_map = $this->parseAttributes($attributes); // If the attributes can't be parsed, treat the tag as content. if ($attribute_map === null) { return null; } } else { $attribute_map = array(); } $node = id(new PhutilDOMNode()) ->setTagName($tag_name) ->setAttributes($attribute_map) - ->setRawString('<'.$raw_content.'>'); + ->setRawHead('<'.$raw_content.'>'); $cursor = $this->getCursor(); $cursor->appendChild($node); if (!$self_close) { $this->setCursor($node); } return $node; } private function setCursor(PhutilDOMNode $cursor) { $this->cursor = $cursor; return $this; } private function getCursor() { return $this->cursor; } private function parseAttributes($attributes) { $state = 'key'; $whitespace = array( ' ' => true, "\n" => true, "\t" => true, "\r" => true, ); $map = array(); $len = strlen($attributes); $key_pos = null; for ($ii = 0; $ii < $len; $ii++) { $c = $attributes[$ii]; $is_space = isset($whitespace[$c]); switch ($state) { case 'key': // We're looking for the start of an attribute name. // Skip over any whitespace. if ($is_space) { break; } // If we see "". if (isset($map[$name_value])) { return null; } } // If we find an "=", that's the end of the name. Next, we're going // to parse a value. if ($c === '=') { $state = 'value'; break; } // If we find whitespace, that's the end of the name. We're going // to look for an "=". if ($is_space) { $state = 'equals'; break; } break; case 'equals': // We've parsed the name of an attribute and are looking for an // "=" character. // Skip over any whitespace. if ($is_space) { break; } // This is the "=" we're looking for, so we're good to go. if ($c === '=') { $state = 'value'; break; } // If this is anything else, this is an attribute name with no // value. Treat it as "true" and move on. This corresponds to an // input like "". $map[$name_value] = true; $name_pos = $ii; $state = 'name'; break; case 'value': // We've parsed an "=" and are looking for the start of a value. // Skip over any whitespace. if ($is_space) { break; } // Don't accept "parseDocument($input); // We're just testing the child list of the root node since this // reduces the amount of boilerplate in the test cases. $list = array(); foreach ($document->getChildren() as $child) { $list[] = $child->toDictionary(); } $this->assertEqual( $expect, $list, pht('DOM tree for "%s".', $test)); } } + public function testSelectChildrenWithTags() { + $input = 'x'; + $document = id(new PhutilHTMLParser()) + ->parseDocument($input); + + $children = $document->selectChildrenWithTags(array('a')); + + $list = array(); + foreach ($children as $child) { + $list[] = $child->toDictionary(); + } + + $this->assertEqual( + array( + array( + 'tag' => 'a', + 'attributes' => array(), + 'children' => array(), + ), + array( + 'content' => '', + ), + array( + 'tag' => 'a', + 'attributes' => array(), + 'children' => array(), + ), + array( + 'content' => '', + ), + array( + 'tag' => 'a', + 'attributes' => array(), + 'children' => array(), + ), + array( + 'content' => '', + ), + array( + 'tag' => 'a', + 'attributes' => array(), + 'children' => array(), + ), + array( + 'content' => 'x', + ), + array( + 'tag' => 'a', + 'attributes' => array(), + 'children' => array(), + ), + array( + 'content' => '', + ), + ), + $list, + pht('Child selection of: %s.', $input)); + } + }