Finally, a smarter HTML parser

Small, short code snippets that other people may find useful. Do you have a good regex that you would like to share? Share it! Even better, the code can be commented on, and improved.

Moderator: General Moderators

Post Reply
User avatar
Ambush Commander
DevNet Master
Posts: 3698
Joined: Mon Oct 25, 2004 9:29 pm
Location: New Jersey, US

Finally, a smarter HTML parser

Post by Ambush Commander »

It doesn't actually do anything yet. Well, it does. It tokenizes the string, then it checks the elements for well-formedness under my simplified XHTML doctype (with bad stuff like <script> removed). Unit tests too.

The last test is to take the tokens and turn them back into text, as well as to add parsing for attributes (which will require knowledge of a dozen more RFCs, so I'm not so keen on working on that... yet...)

Code: Select all

<?php

/**
 * Splits an HTML string into a flat list of token objects (HTML_Text,
 * HTML_Comment, HTML_StartTag, HTML_EmptyTag, HTML_EndTag) and provides the
 * small string-scanning helpers that tokenisation needs.
 *
 * The class keeps no state; every method works only on its arguments.
 */
class MarkupLexer
{
    
    /**
     * Replaces typographic shorthand with plain equivalents: "--" becomes an
     * en dash, curly single/double quotes become straight quotes and the
     * ellipsis character becomes three dots.
     *
     * @param string $string text to convert
     * @return string converted text
     */
    function convertShorthand($string) {
        $trans = array(
            '--' => "–",
            "’"  => "'",
            "‘"  => "'",
            '“'  => '"',
            '”'  => '"',
            '…'  => "..."
        );
        return strtr($string, $trans);
    }
    
    /**
     * Finds the first single or double quote at or after $offset.
     *
     * @param string $string text to search
     * @param int    $offset position to start searching from
     * @return int|false position of the quote, or false when there is none
     */
    function nextQuote($string, $offset = 0) {
        $quotes = array('"', "'");
        return $this->next($string, $quotes, $offset);
    }
    
    /**
     * Finds the first space, tab, carriage return or line feed at or after
     * $offset.
     *
     * @param string $string text to search
     * @param int    $offset position to start searching from
     * @return int|false position of the whitespace, or false when there is none
     */
    function nextWhiteSpace($string, $offset = 0) {
        $spaces = array(chr(0x20), chr(0x9), chr(0xD), chr(0xA));
        return $this->next($string, $spaces, $offset);
    }
    
    /**
     * Finds the earliest occurrence of any needle at or after $offset.
     *
     * @param string       $haystack text to search
     * @param array|string $needles  list of needles; a string is treated as a
     *                               list of its individual characters
     * @param int          $offset   position to start searching from
     * @return int|false smallest matching position, or false when no needle
     *                   occurs
     */
    function next($haystack, $needles, $offset = 0) {
        if (is_string($needles)) {
            $string_needles = $needles;
            $needles = array();
            $size = strlen($string_needles);
            for ($i = 0; $i < $size; $i++) {
                // square brackets: the {} offset syntax no longer parses on PHP 8
                $needles[] = $string_needles[$i];
            }
        }
        $positions = array();
        foreach ($needles as $needle) {
            $position = strpos($haystack, $needle, $offset);
            if ($position !== false) {
                $positions[] = $position;
            }
        }
        return empty($positions) ? false : min($positions);
    }
    
    /**
     * Advances $offset past any run of space, tab, CR or LF characters.
     *
     * @param string $string text being scanned
     * @param int    $offset position to start from
     * @return int first position at or after $offset that is not whitespace
     *             (equal to strlen($string) when only whitespace remains)
     */
    function _skipWhiteSpace($string, $offset) {
        $size = strlen($string);
        while ($offset < $size && strpos(" \t\r\n", $string[$offset]) !== false) {
            $offset++;
        }
        return $offset;
    }
    
    /**
     * Breaks an HTML string into tokens.
     *
     * Text between tags becomes HTML_Text, "<!-- ... -->" becomes
     * HTML_Comment, and everything else between "<" and ">" is handed to
     * _tokenizeTag(). A "<" that is never closed by ">" is emitted, together
     * with the rest of the input, as literal text.
     *
     * @param string $string HTML to tokenise
     * @return array list of token objects in document order
     */
    function tokenizeHTML($string) {
        
        // some quick checking (if empty, return empty)
        $string = (string) $string;
        if ($string == '') return array();
        
        $length = strlen($string);
        $cursor = 0;        // our location in the text
        $tokens = array();  // result array
        
        while ($cursor < $length) {
            
            $position_next_lt = strpos($string, '<', $cursor);
            
            if ($position_next_lt === false) {
                // no more tags: the remainder is plain text
                $tokens[] = new HTML_Text(substr($string, $cursor));
                break;
            }
            
            if ($position_next_lt > $cursor) {
                // text in front of the next tag
                $tokens[] = new HTML_Text(
                  substr($string, $cursor, $position_next_lt - $cursor));
            }
            
            // step inside the tag, just past the '<'
            $cursor = $position_next_lt + 1;
            
            // Comments are closed by '-->' rather than by the first '>', so a
            // '>' inside the comment text does not cut it short.
            if (substr($string, $cursor, 3) === '!--') {
                $position_comment_end = strpos($string, '-->', $cursor + 1);
                if ($position_comment_end !== false) {
                    $body_start = $cursor + 3;
                    // '<!-->' and '<!--->' close before the body starts
                    $body = ($position_comment_end > $body_start)
                      ? substr($string, $body_start,
                          $position_comment_end - $body_start)
                      : '';
                    $tokens[] = new HTML_Comment($body);
                    $cursor = $position_comment_end + 3;
                    continue;
                }
            }
            
            $position_next_gt = strpos($string, '>', $cursor);
            if ($position_next_gt === false) {
                // tag is never closed: keep it as literal text
                $tokens[] = new HTML_Text('<' . substr($string, $cursor));
                break;
            }
            
            $segment = (string) substr($string, $cursor,
              $position_next_gt - $cursor);
            $tokens[] = $this->_tokenizeTag($segment);
            $cursor = $position_next_gt + 1;
        }
        return $tokens;
    }
    
    /**
     * Turns the text between "<" and ">" of a non-comment tag into a token.
     *
     * @param string $segment tag internals without the angle brackets
     * @return object HTML_EndTag, HTML_EmptyTag or HTML_StartTag
     */
    function _tokenizeTag($segment) {
        
        // End tag: leading slash
        if ($segment !== '' && $segment[0] === '/') {
            return new HTML_EndTag(substr($segment, 1));
        }
        
        // Self-closing: the LAST character is a slash. Looking at the first
        // slash instead would miss tags whose attribute values contain one,
        // e.g. <img src="a/b.png" />.
        $is_self_closing = ($segment !== '' && substr($segment, -1) === '/');
        if ($is_self_closing) {
            $segment = (string) substr($segment, 0, strlen($segment) - 1);
        }
        
        // Everything after the first whitespace is the attribute string
        $position_first_space = $this->nextWhiteSpace($segment);
        if ($position_first_space === false) {
            $type = $segment;
            $attributes = array();
        } else {
            $type = substr($segment, 0, $position_first_space);
            $attributes = $this->tokenizeAttributeString(
              trim(substr($segment, $position_first_space)));
        }
        
        if ($is_self_closing) {
            return new HTML_EmptyTag($type, $attributes);
        }
        return new HTML_StartTag($type, $attributes);
    }
    
    /**
     * Parses the attribute part of a tag into a key => value array.
     *
     * Supported forms: key="value", key='value', key=value (unquoted, ends at
     * the next whitespace) and bare boolean attributes, which are stored as
     * key => key. A quoted value whose closing quote is missing runs to the
     * end of the string. Entries with an empty key are discarded.
     *
     * The "=" must follow the key without intervening whitespace; whitespace
     * after the "=" is skipped.
     *
     * @param string $string attribute string, e.g. 'href="a.php" selected'
     * @return array map of attribute name to value
     */
    function tokenizeAttributeString($string) {
        $string = (string) $string;
        if ($string == '') return array();
        
        $attributes = array();
        $size = strlen($string);
        $cursor = 0;
        
        // Every pass moves $cursor forward, so the loop always terminates.
        while (true) {
            $cursor = $this->_skipWhiteSpace($string, $cursor);
            if ($cursor >= $size) {
                break;
            }
            
            $position_next_space = $this->nextWhiteSpace($string, $cursor);
            $position_next_equal = strpos($string, '=', $cursor);
            
            if ($position_next_equal !== false &&
                 ($position_next_space === false ||
                  $position_next_equal < $position_next_space)) {
                //attr="asdf"
                $key = trim((string) substr($string, $cursor,
                  $position_next_equal - $cursor));
                $value_start = $this->_skipWhiteSpace($string,
                  $position_next_equal + 1);
                
                if ($value_start < $size &&
                     ($string[$value_start] === '"' ||
                      $string[$value_start] === "'")) {
                    // quoted value: runs to the matching quote
                    $quote = $string[$value_start];
                    $value_end = strpos($string, $quote, $value_start + 1);
                    if ($value_end === false) {
                        // unterminated quote: take the rest of the string
                        $value_end = $size;
                    }
                    $value = (string) substr($string, $value_start + 1,
                      $value_end - $value_start - 1);
                    $cursor = $value_end + 1;
                } else {
                    // unquoted value: runs to the next whitespace
                    $value_end = $this->nextWhiteSpace($string, $value_start);
                    if ($value_end === false) {
                        $value_end = $size;
                    }
                    $value = (string) substr($string, $value_start,
                      $value_end - $value_start);
                    $cursor = $value_end;
                }
                
                if ($key !== '') {
                    $attributes[$key] = $value;
                }
            } else {
                //boolattr
                if ($position_next_space === false) {
                    $position_next_space = $size;
                }
                $key = (string) substr($string, $cursor,
                  $position_next_space - $cursor);
                if ($key !== '') {
                    $attributes[$key] = $key;
                }
                $cursor = $position_next_space + 1;
            }
        }
        return $attributes;
    }
    
}

?>

Code: Select all

<?php

/**
 * A cut-down content model modelled on the XHTML 1.0 DTD, plus a filter that
 * uses it to drop disallowed tokens and balance start/end tags.
 */
class DTD_XHTML_1_0_Simplified
{
    
    /**
     * Content model, keyed by lower-case element name. Each entry is either
     * the string 'EMPTY' (element takes no children) or an array whose
     * index 0 lists the allowed child types ('#PCDATA' stands for text) and
     * whose optional index 1 lists child types to exclude.
     */
    var $info = array();
    
    /**
     * Builds the content model.
     */
    function __construct() {
        $this->_buildInfo();
    }
    
    /**
     * Class-named constructor kept for PHP 4 and for callers that invoke it
     * as an ordinary method; PHP 8 no longer treats it as a constructor.
     */
    function DTD_XHTML_1_0_Simplified() {
        $this->_buildInfo();
    }
    
    /**
     * Fills $this->info from entity groups named after those in the DTD.
     */
    function _buildInfo() {
        
        $entity = array();
        
        $entity['special.extra'] = array('img');
        $entity['special.basic'] = array('br','bdo','span');
        $entity['special'] = array_merge($entity['special.basic'],
          $entity['special.extra']);
        
        $entity['fontstyle.extra'] = array('big','small');
        $entity['fontstyle.basic'] = array('tt','i','b','u','s','strike');
        $entity['fontstyle'] = array_merge($entity['fontstyle.extra'],
          $entity['fontstyle.basic']);
        
        $entity['phrase.extra'] = array('sub','sup');
        $entity['phrase.basic'] = array('em','strong','dfn','code','samp','kbd',
          'var','cite','abbr','acronym','q');
        $entity['phrase'] = array_merge($entity['phrase.extra'],
          $entity['phrase.basic']);
        
        $entity['misc.inline'] = array('ins','del');
        $entity['misc'] = $entity['misc.inline'];
        
        $entity['inline'] = array_merge(array('a'), $entity['special'],
          $entity['fontstyle'], $entity['phrase']);
        
        $entity['heading'] = array('h1','h2','h3','h4','h5','h6');
        $entity['lists'] = array('ul','ol', 'dl');
        $entity['blocktext'] = array('pre','hr','blockquote','address');
        
        $entity['block'] = array_merge(array('p','div','table'),
          $entity['heading'],$entity['lists'], $entity['blocktext']);
        
        // %Inline; is #PCDATA plus the full %inline; group. Building it from
        // 'special' alone would forbid <b>, <em>, <a>, ... inside <p>, headings
        // and every other inline container.
        $entity['Inline'] = array_merge(array('#PCDATA'), $entity['inline'],
          $entity['misc.inline']);
        $entity['Flow'] = array_merge(array('#PCDATA'), $entity['block'],
          $entity['inline'], $entity['misc']);
        $entity['a.content'] = array_merge(array('#PCDATA'), $entity['special'],
          $entity['fontstyle'], $entity['phrase'], $entity['misc.inline']);
        
        $entity['pre.content'] = array_merge(array('#PCDATA', 'a'),
          $entity['special.basic'], $entity['fontstyle.basic'],
          $entity['phrase.basic'], $entity['misc.inline']);
        
        $this->info = array();
        
        // Elements with Flow content. 'li' must be listed: isTagAllowed()
        // rejects any tag without an entry, which would make <ul>/<ol>
        // unable to hold anything.
        $flow_elements = array('ins','del','blockquote','dd','li','div');
        foreach ($flow_elements as $name) {
            $this->info[$name] = array($entity['Flow']);
        }
        
        // Elements with Inline content
        $inline_elements = array(
            'em','strong','dfn','code','samp','kbd','var','cite','abbr',
            'acronym','q','sub','sup','tt','i','b','big','small','u','s',
            'strike','bdo','span','dt','p','h1','h2','h3','h4','h5','h6'
        );
        foreach ($inline_elements as $name) {
            $this->info[$name] = array($entity['Inline']);
        }
        
        $this->info['ol']  =
        $this->info['ul']  = array(array('li'),array(),'+');
        
        $this->info['dl']  = array(array('dt','dd'));
        $this->info['address'] = array(array_merge(array('#PCDATA', 'p'),
          $entity['inline'], $entity['misc.inline']));
        
        $this->info['img'] =
        $this->info['br']  =
        $this->info['hr']  = 'EMPTY';
        
        $this->info['pre'] = array($entity['pre.content']);
        
        $this->info['a'] = array($entity['a.content']);
        
    }
    
    /**
     * Filters a token list so that it nests correctly under this DTD.
     *
     * Tag types are lower-cased in place. Text, start tags and empty tags
     * that isTagAllowed() rejects for the current parent are dropped, as are
     * end tags that do not match the innermost open element. Comments pass
     * through untouched. Elements still open at the end of input are closed
     * in reverse order.
     *
     * @param array $tokens tokens as produced by MarkupLexer::tokenizeHTML()
     * @return array filtered, balanced token list
     */
    function makeTokensWellFormed($tokens) {
        
        $opened_tags = array(); // stack of start tags awaiting their end tag
        $array = array();
        
        foreach ($tokens as $token) {
            $is_text = is_a($token, 'HTML_Text');
            if (!$is_text && !is_a($token, 'HTML_Comment')) {
                $token->selfToLower();
            }
            
            if (is_a($token, 'HTML_StartTag')
             || is_a($token, 'HTML_EmptyTag')
             || $is_text) {
                $depth = count($opened_tags);
                $parent = ($depth > 0) ? $opened_tags[$depth - 1] : '#ROOT';
                if (!$this->isTagAllowed($token, $parent)) {
                    continue;
                }
                // HTML_EmptyTag extends HTML_StartTag but never gets an end
                // tag, so it must not go on the stack: otherwise <br /> would
                // block the next real end tag and gain a bogus </br>.
                if (is_a($token, 'HTML_StartTag')
                 && !is_a($token, 'HTML_EmptyTag')) {
                    $opened_tags[] = $token;
                }
            } elseif (is_a($token, 'HTML_EndTag')) {
                $depth = count($opened_tags);
                if ($depth == 0 ||
                  $opened_tags[$depth - 1]->getType() != $token->getType()) {
                    continue;
                }
                array_pop($opened_tags);
            }
            $array[] = $token;
        }
        
        // close whatever is still open, innermost first
        while (!empty($opened_tags)) {
            $unclosed = array_pop($opened_tags);
            $array[] = new HTML_EndTag($unclosed->getType());
        }
        
        return $array;
        
    }
    
    /**
     * Reports whether $tag may appear directly inside $parent.
     *
     * @param object      $tag    token exposing getType()
     * @param object|null $parent enclosing start tag; '#ROOT' or null means
     *                            the document root, which is treated as 'div'
     * @return bool true when the content model allows the child
     */
    function isTagAllowed($tag, $parent) {
        if ($parent === '#ROOT' || $parent === null) {
            $parent_type = 'div';
        } else {
            $parent_type = $parent->getType();
        }
        $type = $tag->getType();
        
        // check if is real tag
        if ($type !== '#PCDATA' && !isset($this->info[$type])) return false;
        
        // unknown parents and 'EMPTY' parents accept no children
        if (!isset($this->info[$parent_type])
         || !is_array($this->info[$parent_type])) {
            return false;
        }
        $parent_info = $this->info[$parent_type];
        
        return in_array($type, $parent_info[0], true) &&
          (empty($parent_info[1]) || !in_array($type, $parent_info[1], true));
        
    }
    
}

?>
Unit tests, of course...

Code: Select all

<?php

/**
 * SimpleTest cases for MarkupLexer.
 */
class SimpleTest_MarkupLexer extends UnitTestCase
{
    
    // lexer under test, recreated before each test method
    var $MarkupLexer;
    
    function setUp() {
        // plain assignment: "=& new" no longer parses on PHP 7 and later
        $this->MarkupLexer = new MarkupLexer();
    }
    
    /**
     * Each pair is (input, expected output of convertShorthand()).
     */
    function test_convertShorthand() {
        
        $test_array = array(
             array('The dog--the cat.',     'The dog–the cat.')
            ,array('Perhaps… it was fate!', 'Perhaps... it was fate!')
            ,array('He said, “What?”',      'He said, "What?"')
            ,array('‘Camelot’',             '\'Camelot\'')
        );
        
        foreach ($test_array as $input_expect) {
            $input  = $input_expect[0];
            $expect = $input_expect[1];
            $result = $this->MarkupLexer->convertShorthand($input);
            $this->assertEqual($expect, $result);
        }
        
    }
    
    /**
     * nextWhiteSpace() must report the first space, tab, CR or LF, or false.
     */
    function test_nextWhiteSpace() {
        $HP = $this->MarkupLexer;
        $this->assertIdentical(false, $HP->nextWhiteSpace('asdf'));
        $this->assertIdentical(0, $HP->nextWhiteSpace(' asdf'));
        $this->assertIdentical(0, $HP->nextWhiteSpace("\nasdf"));
        $this->assertIdentical(1, $HP->nextWhiteSpace("a\tsdf"));
        $this->assertIdentical(4, $HP->nextWhiteSpace("asdf\r"));
        $this->assertIdentical(2, $HP->nextWhiteSpace("as\t\r\nasdf as"));
    }
    
    /**
     * $input[$i] is tokenised and compared with $expect[$i].
     */
    function test_tokenizeHTML() {
        
        $input  = array();
        $expect = array();
        
        $input[] = '';
        $expect[] = array();
        
        $input[] = 'This is regular text.';
        $expect[] = array(
            new HTML_Text('This is regular text.')
            );
        
        $input[] = 'This is <b>bold</b> text';
        $expect[] = array(
            new HTML_Text('This is ')
           ,new HTML_StartTag('b', array())
           ,new HTML_Text('bold')
           ,new HTML_EndTag('b')
           ,new HTML_Text(' text')
            );
        
        // the lexer keeps tag case as written
        $input[] = '<DIV>Totally rad dude. <b>asdf</b></div>';
        $expect[] = array(
            new HTML_StartTag('DIV', array())
           ,new HTML_Text('Totally rad dude. ')
           ,new HTML_StartTag('b', array())
           ,new HTML_Text('asdf')
           ,new HTML_EndTag('b')
           ,new HTML_EndTag('div')
            );
        
        // unknown and unbalanced tags are tokenised as-is
        $input[] = '<asdf></asdf><d></d><poOloka><poolasdf><ds></asdf></ASDF>';
        $expect[] = array(
            new HTML_StartTag('asdf')
           ,new HTML_EndTag('asdf')
           ,new HTML_StartTag('d')
           ,new HTML_EndTag('d')
           ,new HTML_StartTag('poOloka')
           ,new HTML_StartTag('poolasdf')
           ,new HTML_StartTag('ds')
           ,new HTML_EndTag('asdf')
           ,new HTML_EndTag('ASDF')
            );
        
        // tab and newline separate attributes just like a space
        $input[] = '<a'."\t".'href="foobar.php"'."\n".'title="foo!">Link to <b id="asdf">foobar</b></a>';
        $expect[] = array(
            new HTML_StartTag('a',array('href'=>'foobar.php','title'=>'foo!'))
           ,new HTML_Text('Link to ')
           ,new HTML_StartTag('b',array('id'=>'asdf'))
           ,new HTML_Text('foobar')
           ,new HTML_EndTag('b')
           ,new HTML_EndTag('a')
            );
        
        $input[] = '<br />';
        $expect[] = array(
            new HTML_EmptyTag('br')
            );
        
        $input[] = '<!-- Comment --> <!-- not so well formed --->';
        $expect[] = array(
            new HTML_Comment(' Comment ')
           ,new HTML_Text(' ')
           ,new HTML_Comment(' not so well formed -')
            );
        
        // a tag that is never closed comes back as literal text
        $input[] = '<a href=""';
        $expect[] = array(
            new HTML_Text('<a href=""')
            );
        
        $size = count($input);
        for($i = 0; $i < $size; $i++) {
            $result = $this->MarkupLexer->tokenizeHTML($input[$i]);
            $this->assertEqual($expect[$i], $result);
            paintIf($result, $expect[$i] != $result);
        }
        
    }
    
    /**
     * $input[$i] is parsed and compared with $expect[$i].
     */
    function test_tokenizeAttributeString() {
        
        $input  = array();
        $expect = array();
        
        $input[] = 'href="asdf" boom="assdf"';
        $expect[] = array('href'=>'asdf', 'boom'=>'assdf');
        
        $input[] = "href='r'";
        $expect[] = array('href'=>'r');
        
        $input[] = 'onclick="javascript:alert(\'asdf\');"';
        $expect[] = array('onclick' => "javascript:alert('asdf');");
        
        $input[] = 'selected';
        $expect[] = array('selected'=>'selected');
        
        // a value without a key is discarded
        $input[] = '="asdf"';
        $expect[] = array();
        
        $size = count($input);
        for($i = 0; $i < $size; $i++) {
            $result = $this->MarkupLexer->tokenizeAttributeString($input[$i]);
            $this->assertEqual($expect[$i], $result);
            paintIf($result, $expect[$i] != $result);
        }
        
    }
    
    
}

?>

Code: Select all

<?php

/**
 * SimpleTest cases for DTD_XHTML_1_0_Simplified.
 */
class SimpleTest_DTD_XHTML_1_0_Simplified extends UnitTestCase
{
    
    /**
     * $input[$i] is run through makeTokensWellFormed() and compared with
     * $expect[$i].
     */
    function test_makeTokensWellFormed() {
        
        // plain assignment: "=& new" no longer parses on PHP 7 and later
        $DTD = new DTD_XHTML_1_0_Simplified();
        
        $input  = array();
        $expect = array();
        
        // an unclosed element gets its end tag appended
        $input[] = array(
            new HTML_StartTag('b')
           ,new HTML_Text('Bold text')
            );
        $expect[] = array(
            new HTML_StartTag('b')
           ,new HTML_Text('Bold text')
           ,new HTML_EndTag('b')
            );
        
        // an end tag with no matching start tag is dropped
        $input[] = array(
            new HTML_Text('Bold text')
           ,new HTML_EndTag('b')
           ,new HTML_Text('asdf')
            );
        $expect[] = array(
            new HTML_Text('Bold text')
           ,new HTML_Text('asdf')
            );
        
        // a block element inside an inline element is dropped with its end tag
        $input[] = array(
            new HTML_StartTag('b')
           ,new HTML_StartTag('div')
           ,new HTML_EndTag('div')
           ,new HTML_EndTag('b')
            );
        $expect[] = array(
            new HTML_StartTag('b')
           ,new HTML_EndTag('b')
            );
        
        // Smoke run over a larger document; the result is not asserted on.
        $MarkupLexer = new MarkupLexer();
        $tokens = $MarkupLexer->tokenizeHTML(
        '<html><body><h1>A title</h1><p>Let us begin</p>
        <p>asdf<p>asdf</p></p><span><div>asdf</div></span></body></html>'
        );
        $tokens = $DTD->makeTokensWellFormed($tokens);
        //var_dump($tokens);
        
        /*
        $input[] = array(
            
            );
        $expect[] = array(
            
            );
        */
        
        $size = count($input);
        for($i = 0; $i < $size; $i++) {
            $result = $DTD->makeTokensWellFormed($input[$i]);
            $this->assertEqual($expect[$i],$result);
            paintIf($result, $expect[$i] != $result);
        }
        
    }
    
}

?>
There is also a lot of stuff that could be factored out, plus the need for DTD compilation (so the $info array doesn't have to be recalculated every time) and a few more DTDs.

What do you think?

Edit

Forgot a few more classes...

Code: Select all

<?php

/**
 * Token holding the text of an HTML comment, without the comment delimiters.
 */
class HTML_Comment
{
    
    // comment text
    var $value;
    
    /**
     * @param string $value comment text
     */
    function __construct($value) {
        $this->value = $value;
    }
    
    /**
     * Class-named constructor kept for PHP 4 and for callers that invoke it
     * as an ordinary method; PHP 8 no longer treats it as a constructor.
     *
     * @param string $value comment text
     */
    function HTML_Comment($value) {
        $this->value = $value;
    }
    
}

?>

Code: Select all

<?php

/**
 * Base class for tag tokens; stores the tag type (element name).
 */
class HTML_Tag
{
    
    // element name as written in the source, until selfToLower() is called
    var $type;
    
    /**
     * @param string $type element name
     */
    function __construct($type) {
        $this->type = $type;
    }
    
    /**
     * Class-named constructor kept for PHP 4 and for subclasses that call it
     * as an ordinary method; PHP 8 no longer treats it as a constructor.
     * It assigns directly instead of delegating to __construct() so that a
     * subclass constructor calling it cannot recurse into itself.
     *
     * @param string $type element name
     */
    function HTML_Tag($type) {
        $this->type = $type;
    }
    
    /**
     * Lower-cases the stored type in place.
     */
    function selfToLower() {
        $this->type = strtolower($this->type);
    }
    
    /**
     * @return string element name
     */
    function getType() {
        return $this->type;
    }
    
}

/**
 * Token for an opening tag: a type plus its attributes.
 */
class HTML_StartTag extends HTML_Tag
{
    
    // attribute name => value
    var $attributes = array();
    
    /**
     * @param string $type       element name
     * @param array  $attributes attribute name => value
     */
    function __construct($type, $attributes = array()) {
        // HTML_Tag() is callable as a plain method on every PHP version
        $this->HTML_Tag($type);
        $this->attributes = $attributes;
    }
    
    /**
     * Class-named constructor kept for PHP 4 and for callers that invoke it
     * as an ordinary method; PHP 8 no longer treats it as a constructor.
     *
     * @param string $type       element name
     * @param array  $attributes attribute name => value
     */
    function HTML_StartTag($type, $attributes = array()) {
        $this->HTML_Tag($type);
        $this->attributes = $attributes;
    }
    
}

/**
 * Token for a self-closing tag such as <br />. Adds nothing to
 * HTML_StartTag; the separate class only lets consumers tell the two apart.
 */
class HTML_EmptyTag extends HTML_StartTag
{
    
}

/**
 * Token for a closing tag such as </b>. Carries only the type inherited
 * from HTML_Tag.
 */
class HTML_EndTag extends HTML_Tag
{
    
}

?>

Code: Select all

<?php

/**
 * Token for a run of character data between tags.
 */
class HTML_Text
{
    
    // the text itself
    var $value;
    
    /**
     * @param string $value text content
     */
    function __construct($value) {
        $this->value = $value;
    }
    
    /**
     * Class-named constructor kept for PHP 4 and for callers that invoke it
     * as an ordinary method; PHP 8 no longer treats it as a constructor.
     *
     * @param string $value text content
     */
    function HTML_Text($value) {
        $this->value = $value;
    }
    
    /**
     * @return string always '#PCDATA', the name content models use for text
     */
    function getType() {return '#PCDATA';}
    
    /**
     * @return string text content
     */
    function getValue() {return $this->value;}
    
    /**
     * Appends $string to the stored text.
     *
     * @param string $string text to append
     */
    function addToValue($string) {$this->value .= $string;}
    
}

?>
Plus, I need to rename the function to something a bit more accurate.
Post Reply