...

/

Parsing Fragmented HTML Documents

Parsing Fragmented HTML Documents

Learn how to implement a fragmented HTML document parser.

Implementing the FragmentsDocumentParser class

We have enough background information to look at our FragmentsDocumentParser implementation:

Press + to interact
<?php
/**
 * Locates HTML-like tag fragments (`<...>`) inside an arbitrary document.
 *
 * The parser does not build a DOM. It records every opening, closing, or
 * self-closing tag it can find as an HtmlFragment, skipping any offsets the
 * caller has marked as "ignored" and skipping tags that appear between
 * `<script>` and its closing `</script` tag.
 *
 * All offsets produced by this class are character offsets into the
 * EOL-normalized document, which is the unit used when indexing the
 * internal string iterator and when slicing with `str()->substr()`.
 */
class FragmentsDocumentParser extends BaseFragmentParser
{
    /**
     * Fragments collected by the most recent call to parse().
     *
     * @var array
     */
    private $fragments = [];

    /**
     * Regions of the document that must not be parsed.
     *
     * Each key is the offset where a region starts and each value is the
     * inclusive offset where that region ends.
     *
     * @var array
     */
    protected $ignoreRanges = [];

    /**
     * True after an opening `<script>` fragment was parsed and until the
     * matching `</script` fragment start has been located.
     *
     * @var bool
     */
    private $isParsingScript = false;

    /**
     * Sets the regions that should be skipped while parsing.
     *
     * @param array $ignoreRanges Map of start offset => inclusive end offset.
     */
    public function setIgnoreRanges($ignoreRanges)
    {
        $this->ignoreRanges = $ignoreRanges;
    }

    /**
     * Attempts to parse a single fragment that starts at the given offset.
     *
     * On success the fragment is appended to the internal fragment list.
     * Nothing is recorded when the tag is never terminated by `>`: another
     * `<` is met first, the input ends, or an ignored region / quoted string
     * runs to the end of the document.
     *
     * @param int $index Offset of the `<` character that opens the fragment.
     */
    private function parseFragment($index)
    {
        $this->buffer = '';
        $isClosed = false;
        $length = count($this->string);

        for ($i = $index; $i < $length; $i++) {
            if (array_key_exists($i, $this->ignoreRanges)) {
                // Replace the ignored region with spaces so that offsets
                // inside the buffer keep lining up with document offsets.
                $jumpIndex = $this->ignoreRanges[$i];
                $this->buffer .= str_repeat(' ', $jumpIndex - $i + 1);
                $i = $jumpIndex;

                continue;
            }

            $this->checkCurrentOffsets($i);

            // A second "<" before the tag is closed, or running out of
            // input, means this was not a complete tag.
            if (($i != $index && $this->current == '<') ||
                ($this->current != '>' && $this->next == null)) {
                break;
            }

            if ($this->isStartOfString()) {
                $i = $this->scanToEndOfString($i);

                continue;
            }

            $this->buffer .= $this->current;

            if ($this->current == '>') {
                $isClosed = true;

                break;
            }
        }

        // Covers the explicit early exits above as well as the loop simply
        // running off the end of the document after a "continue".
        if (! $isClosed) {
            $this->buffer = '';

            return;
        }

        $fragment = new HtmlFragment();
        $fragment->startPosition = $index;
        $fragment->endPosition = $this->position;
        $fragment->content = $this->buffer;
        $fragment->isSelfClosing = str_ends_with($this->buffer, '/>');
        $fragment->isClosingTag = str_starts_with($this->buffer, '</');

        // Number of characters to strip from each side of the raw tag in
        // order to drop "<" or "</" and ">" or "/>".
        $documentContentStartOffset = $fragment->isClosingTag ? 2 : 1;
        $documentContentEndOffset = $fragment->isSelfClosing ? -2 : -1;

        // Stored exactly as returned by the str() wrapper, as before.
        $fragment->documentContent = str($this->buffer)->substr(
            $documentContentStartOffset,
            $documentContentEndOffset
        );
        $documentContent = (string) $fragment->documentContent;

        // The tag name is everything before the first space.
        $fragment->tagName = (string) str($documentContent)->before(' ')->trim();

        // If the tag name itself is an ignored region it was blanked out in
        // the buffer, so recover the real text from the source document.
        $nameStart = $fragment->startPosition + $documentContentStartOffset;

        if (array_key_exists($nameStart, $this->ignoreRanges)) {
            $nameEnd = $this->ignoreRanges[$nameStart];
            $fragment->tagName = str($this->string)
                ->substr($nameStart, $nameEnd - $nameStart + 1)
                ->value();
        }

        // Create a Fragment representing the name.
        if (str($fragment->tagName)->trim()->length() > 0) {
            $fragment->name = new Fragment();
            $fragment->name->content = $fragment->tagName;
            $fragment->name->startPosition = $nameStart;
            $fragment->name->endPosition =
                $nameStart + str($fragment->tagName)->length() - 1;
        }

        // The inner content starts at the first space after the tag name.
        $innerContentStart = mb_strpos($documentContent, ' ');

        if ($innerContentStart !== false) {
            $innerContentFragment = new Fragment();
            $innerContentFragment->content = str($documentContent)
                ->substr($innerContentStart)
                ->trim();

            // Search from the first space onwards. Searching from the very
            // beginning would match an earlier occurrence of the same text
            // inside the tag name (for example "d" in `<div d>`).
            $relativeStart = mb_strpos(
                $documentContent,
                (string) $innerContentFragment->content,
                $innerContentStart
            );

            // Translate to a document offset by adding back the stripped
            // "<" / "</" prefix and the start of the fragment.
            $innerContentFragment->startPosition =
                $relativeStart + $documentContentStartOffset + $fragment->startPosition;
            $innerContentFragment->endPosition =
                $innerContentFragment->startPosition +
                str($innerContentFragment->content)->length();

            $fragment->innerContent = $innerContentFragment;
        }

        if (! $fragment->isClosingTag && ! $fragment->isSelfClosing &&
            mb_strtolower($fragment->tagName, 'UTF-8') === 'script'
        ) {
            $this->isParsingScript = true;
        }

        $this->fragments[] = $fragment;
    }

    /**
     * Tests whether an offset falls inside any ignored region.
     *
     * @param int $offset Character offset into the document.
     * @return bool
     */
    private function isIgnoredOffset($offset)
    {
        foreach ($this->ignoreRanges as $rangeStart => $rangeEnd) {
            if ($offset >= $rangeStart && $offset <= $rangeEnd) {
                return true;
            }
        }

        return false;
    }

    /**
     * Finds the offset of every `<` that is not inside an ignored region.
     *
     * @return array List of character offsets, in document order.
     */
    private function buildFragmentIndex()
    {
        $source = (string) $this->string;

        preg_match_all('/</', $source, $matches, PREG_OFFSET_CAPTURE);

        $fragmentStarts = [];
        $previousByteOffset = 0;
        $characterOffset = 0;

        foreach ($matches[0] as $match) {
            // PREG_OFFSET_CAPTURE reports byte offsets, while the rest of
            // this class works with character offsets. Convert by counting
            // the characters between consecutive matches.
            $byteOffset = $match[1];
            $characterOffset += mb_strlen(
                substr($source, $previousByteOffset, $byteOffset - $previousByteOffset),
                'UTF-8'
            );
            $previousByteOffset = $byteOffset;

            if (! $this->isIgnoredOffset($characterOffset)) {
                $fragmentStarts[] = $characterOffset;
            }
        }

        return $fragmentStarts;
    }

    /**
     * Returns the fragments collected by the most recent call to parse().
     *
     * @return array
     */
    public function getFragments()
    {
        return $this->fragments;
    }

    /**
     * Parses a document and returns every fragment found in it.
     *
     * Line endings are normalized to "\n" before parsing. Results from any
     * previous call are discarded first.
     *
     * @param string $value The document source.
     * @return array The parsed fragments, in document order.
     */
    public function parse($value)
    {
        // Start clean so that reusing the instance does not return
        // fragments belonging to a previously parsed document.
        $this->fragments = [];
        $this->isParsingScript = false;
        $this->string = new Utf8StringIterator(
            Str::normalizeEol($value, "\n")
        );

        // Stringify once; used for the "</script" look-ahead below.
        $source = (string) $this->string;

        $fragmentStartIndexes = $this->buildFragmentIndex();
        $fragmentCount = count($fragmentStartIndexes);

        for ($i = 0; $i < $fragmentCount; $i++) {
            $this->parseFragment($fragmentStartIndexes[$i]);
            $this->resetState();

            if (! $this->isParsingScript) {
                continue;
            }

            // Skip every candidate "<" inside the script body and resume
            // with the closing script tag, if there is one.
            for ($j = $i + 1; $j < $fragmentCount; $j++) {
                $candidate = mb_strtolower(
                    mb_substr($source, $fragmentStartIndexes[$j], 8, 'UTF-8'),
                    'UTF-8'
                );

                if ($candidate === '</script') {
                    $this->isParsingScript = false;
                    // The outer loop's $i++ lands exactly on $j.
                    $i = $j - 1;

                    break;
                }
            }
        }

        return $this->fragments;
    }
}

The code above has many moving parts, but it primarily relies on the same techniques and methods we have seen earlier. Because of this, we will discuss only the more interesting aspects of our implementation. The first aspect we will examine is how our ignore ranges are managed internally. Our setIgnoreRanges method, defined between lines 11 and 14, sets our internal list of ranges to ignore. These ranges are utilized within the buildFragmentIndex method, defined between lines 153 and 177. This method is responsible for using our regular expression to determine the start offset of all potential HTML tags within the source document. We iterate over these offsets between lines 160 and 174 to build a new list of all the offsets at which we should start parsing within our input document. We compare each offset from our regular expression against the list of ranges we should ignore between lines 164 and 169. If the start offset captured by our regular expression is contained within one of the ranges to ignore, we do not add it to our final list of fragment start locations.

The final list of fragment start locations is used within our parse method, which is defined between lines 184 and 215. The first few lines of our method work to manage some internal state and set our internal Utf8StringIterator instance. ...