diff --git a/src/parser/html/PhutilDOMNode.php b/src/parser/html/PhutilDOMNode.php index 5b87ae3..84781c7 100644 --- a/src/parser/html/PhutilDOMNode.php +++ b/src/parser/html/PhutilDOMNode.php @@ -1,193 +1,208 @@ content = $content; return $this; } public function getContent() { return $this->content; } public function isContentNode() { return ($this->content !== null); } public function setTagName($tag_name) { $this->tagName = $tag_name; return $this; } public function getTagName() { return $this->tagName; } public function appendChild(PhutilDOMNode $node) { $node->parentNode = $this; $this->children[] = $node; return $this; } public function getChildren() { return $this->children; } public function getParentNode() { return $this->parentNode; } public function setAttributes(array $attributes) { $this->attributes = $attributes; return $this; } public function getAttributes() { return $this->attributes; } - public function setRawString($raw_string) { - $this->rawString = $raw_string; + public function setRawHead($raw_string) { + $this->rawHead = $raw_string; + return $this; + } + + public function setRawTail($raw_tail) { + $this->rawTail = $raw_tail; return $this; } public function getRawString() { - return $this->rawString; + $raw = array(); + $raw[] = $this->rawHead; + + foreach ($this->getChildren() as $child) { + $raw[] = $child->getRawString(); + } + + $raw[] = $this->rawTail; + + return implode('', $raw); } public function toDictionary() { if ($this->isContentNode()) { return array( 'content' => $this->content, ); } else { $children = array(); foreach ($this->getChildren() as $child) { $children[] = $child->toDictionary(); } return array( 'tag' => $this->getTagName(), 'attributes' => $this->getAttributes(), 'children' => $children, ); } } /** * Get a list of the children of a given DOM node, treating unexpected * tags as if they were raw content. */ public function selectChildrenWithTags(array $tag_list) { $tag_map = array_fuse($tag_list); $nodes = array(); foreach ($this->getChildren() as $child) { // If this is already a content node, just keep it as-is. if ($child->isContentNode()) { $nodes[] = $child; continue; } $tag_name = $child->getTagName(); // If this is a tag that we're allowing, keep it as-is. if (isset($tag_map[$tag_name])) { $nodes[] = $child; continue; } // Otherwise, this is some other tag. Convert it into a content // node. - $raw_content = $child->getRawString(); + $raw_string = $child->getRawString(); $nodes[] = id(new self()) - ->setContent($raw_content) - ->setRawContent($raw_content); + ->setContent($raw_string) + ->setRawHead($raw_string); } return $this->mergeContentNodes($nodes); } public function getRawContentString() { $content_node = $this->selectChildrenWithTags(array()); if (!$content_node) { return ''; } return head($content_node)->getRawString(); } public function mergeContent() { $this->children = $this->mergeContentNodes($this->children); foreach ($this->getChildren() as $child) { $child->parentNode = $this; $child->mergeContent(); } return $this; } /** * Given a list of nodes, combine sequences of multiple adjacent content * nodes into single nodes. */ private function mergeContentNodes(array $nodes) { $list = array(); $content_block = array(); - foreach ($this->getChildren() as $child) { - if ($child->isContentNode()) { - $content_block[] = $child; + foreach ($nodes as $node) { + if ($node->isContentNode()) { + $content_block[] = $node; continue; } $list[] = $content_block; $content_block = array(); - $list[] = $child; + $list[] = $node; } $list[] = $content_block; $results = array(); foreach ($list as $item) { if (!is_array($item)) { $results[] = $item; continue; } if (!$item) { continue; } $parts = array(); foreach ($item as $content_node) { $parts[] = $content_node->getRawString(); } $parts = implode('', $parts); if (!strlen($parts)) { continue; } $results[] = id(new self()) ->setContent($parts) - ->setRawString($parts); + ->setRawHead($parts); } return $results; } } diff --git a/src/parser/html/PhutilHTMLParser.php b/src/parser/html/PhutilHTMLParser.php index 4b4841d..a6e3d63 100644 --- a/src/parser/html/PhutilHTMLParser.php +++ b/src/parser/html/PhutilHTMLParser.php @@ -1,426 +1,430 @@ "). Non-tag // content is anything else. $segment_pos = 0; $segments = array(); $in_tag = false; for ($ii = 0; $ii < strlen($corpus); $ii++) { $c = $corpus[$ii]; if ($in_tag && ($c === '>')) { if ($segment_pos !== null) { $segments[] = array( 'tag' => $in_tag, 'pos' => $segment_pos, 'end' => $ii + 1, ); } $segment_pos = $ii + 1; $in_tag = false; continue; } if (!$in_tag && ($c === '<')) { $segments[] = array( 'tag' => $in_tag, 'pos' => $segment_pos, 'end' => $ii, ); $segment_pos = $ii; $in_tag = true; continue; } } // Add whatever content was left at the end of the string. If we were in // a tag but did not find a closing ">", we treat this as normal content. $segments[] = array( 'tag' => false, 'pos' => $segment_pos, 'end' => $ii, ); // Slice the marked segments out of the raw corpus so we get a list of // "tag" strings and a list of "non-tag" strings. $parts = array(); $corpus_length = strlen($corpus); foreach ($segments as $segment) { $tag = $segment['tag']; $pos = $segment['pos']; $len = $segment['end'] - $pos; // If this is a tag, we'll drop the "<" at the beginning and the ">" // at the end here. if ($tag) { $slice_pos = $pos + 1; $slice_len = $len - 2; } else { $slice_pos = $pos; $slice_len = $len; } if (($slice_pos < $corpus_length) && ($slice_len > 0)) { $content = substr($corpus, $slice_pos, $slice_len); } else { $content = ''; } $parts[] = array( 'tag' => $tag, 'pos' => $pos, 'len' => $len, 'content' => $content, ); } $root = new PhutilDOMNode(); $this->setCursor($root); foreach ($parts as $part) { $tag = $this->newTagDOMNode($part); if ($tag !== null) { continue; } $content = $part['content']; // If this part is a tag, restore the angle brackets. if ($part['tag']) { $content = '<'.$content.'>'; } $node = id(new PhutilDOMNode()) ->setContent($content) - ->setRawString($content); + ->setRawHead($content); $this->getCursor()->appendChild($node); } $root->mergeContent(); return $root; } private function newTagDOMNode(array $part) { if (!$part['tag']) { return null; } $raw_content = $part['content']; $content = $raw_content; $content = trim($content); $content_len = strlen($content); // If the tag content begins with "/", like "", strip the slash // off and mark this as a closing tag. $is_close = false; if ($content_len > 0 && $content[0] === '/') { $is_close = true; $content = substr($content, 1); $content = trim($content); $content_len = strlen($content); } // If the tag content ends with "/", like "