The last test is to take the tokens and turn them into text, as well as add parsing for attributes (which will require knowledge of a dozen more RFCs, so I'm not so keen on working on that... yet...)
Code: Select all
<?php
/**
 * Hand-written lexer that splits an HTML string into a flat list of token
 * objects: HTML_Text, HTML_Comment, HTML_StartTag, HTML_EmptyTag and
 * HTML_EndTag.
 *
 * Known limitation: a tag is considered to end at the first '>' after its
 * '<', so a '>' inside an attribute value or inside a comment terminates
 * the tag early.
 */
class MarkupLexer
{
    /**
     * Replaces typographic shorthand ("smart" quotes, ellipsis character,
     * double hyphen) with the replacement strings in the table below.
     *
     * NOTE(review): the replacement values may originally have been HTML
     * entities that were decoded when the code was pasted - confirm.
     *
     * @param string $string Text to convert.
     * @return string Converted text.
     */
    function convertShorthand($string) {
        $trans = array(
            '--' => "–",
            "’"  => "'",
            "‘"  => "'",
            '“'  => '"',
            '”'  => '"',
            '…'  => "...",
        );
        return strtr($string, $trans);
    }

    /**
     * Finds the first single or double quote at or after $offset.
     *
     * @param string $string
     * @param int    $offset Byte offset to start searching from.
     * @return int|false Byte position, or false if there is no quote.
     */
    function nextQuote($string, $offset = 0) {
        $quotes = array('"', "'");
        return $this->next($string, $quotes, $offset);
    }

    /**
     * Finds the first whitespace byte (space, tab, CR or LF) at or after
     * $offset.
     *
     * @param string $string
     * @param int    $offset Byte offset to start searching from.
     * @return int|false Byte position, or false if there is no whitespace.
     */
    function nextWhiteSpace($string, $offset = 0) {
        $spaces = array(chr(0x20), chr(0x9), chr(0xD), chr(0xA));
        return $this->next($string, $spaces, $offset);
    }

    /**
     * Returns the lowest position at which any of the needles occurs.
     *
     * @param string       $haystack String to search.
     * @param array|string $needles  List of needles; a string is treated as
     *                               a list of its individual bytes.
     * @param int          $offset   Byte offset to start searching from.
     * @return int|false Lowest match position, or false if nothing matches.
     */
    function next($haystack, $needles, $offset = 0) {
        if (is_string($needles)) {
            $string_needles = $needles;
            $needles = array();
            $size = strlen($string_needles);
            for ($i = 0; $i < $size; $i++) {
                $needles[] = $string_needles[$i];
            }
        }
        // strpos() rejects an offset beyond the end of the haystack;
        // nothing can match there anyway.
        if ($offset > strlen($haystack)) {
            return false;
        }
        $positions = array();
        foreach ($needles as $needle) {
            $position = strpos($haystack, $needle, $offset);
            if ($position !== false) {
                $positions[] = $position;
            }
        }
        return empty($positions) ? false : min($positions);
    }

    /**
     * Splits an HTML string into tokens.
     *
     * Text between tags becomes HTML_Text. An unterminated trailing tag
     * (a '<' with no following '>') is emitted as HTML_Text including the
     * '<'.
     *
     * @param string $string HTML source.
     * @return array List of token objects in document order.
     */
    function tokenizeHTML($string) {
        $string = (string) $string;
        if ($string === '') return array();

        $cursor = 0;          // our location in the text
        $inside_tag = false;  // whether the cursor sits just after a '<'
        $array = array();     // result array

        while (true) {
            $position_next_lt = strpos($string, '<', $cursor);
            $position_next_gt = strpos($string, '>', $cursor);

            // The cursor is exactly on a '<': step over it into the tag.
            // Triggers on "<b>asdf</b>" but not "asdf <b></b>".
            if ($position_next_lt === $cursor) {
                $inside_tag = true;
                $cursor++;
            }

            if (!$inside_tag) {
                if ($position_next_lt !== false) {
                    // Text followed by another tag.
                    $array[] = new HTML_Text(
                        substr($string, $cursor, $position_next_lt - $cursor)
                    );
                    $cursor = $position_next_lt + 1;
                    $inside_tag = true;
                    continue;
                }
                // No more tags: the remainder (if any) is text.
                if ($cursor !== strlen($string)) {
                    $array[] = new HTML_Text(substr($string, $cursor));
                }
                break;
            }

            if ($position_next_gt === false) {
                // Tag is never closed: emit the rest verbatim as text.
                $array[] = new HTML_Text('<' . substr($string, $cursor));
                break;
            }

            // Well-formed tag: hand its internals to the segment parser.
            $segment = (string) substr($string, $cursor, $position_next_gt - $cursor);
            $array[] = $this->_parseTagSegment($segment);
            $inside_tag = false;
            $cursor = $position_next_gt + 1;
        }
        return $array;
    }

    /**
     * Turns the text between '<' and '>' into a single token.
     *
     * Internal helper for tokenizeHTML().
     *
     * @param string $segment Tag internals without the angle brackets.
     * @return object HTML_Comment, HTML_EndTag, HTML_EmptyTag or
     *                HTML_StartTag.
     */
    function _parseTagSegment($segment) {
        $length = strlen($segment);

        // Comment: "!--" ... "--"
        if (substr($segment, 0, 3) == '!--'
            && substr($segment, $length - 2, 2) == '--') {
            return new HTML_Comment((string) substr($segment, 3, $length - 5));
        }

        // End tag: leading slash. Trailing whitespace ("</b >") is not
        // part of the element name.
        if ($length > 0 && $segment[0] === '/') {
            return new HTML_EndTag(rtrim((string) substr($segment, 1)));
        }

        // Self-closing: the LAST character is a slash. Looking for the
        // first slash instead would miss tags whose attribute values
        // contain one, e.g. <img src="a/b.png" />.
        $is_self_closing = (substr($segment, -1) === '/');
        if ($is_self_closing) {
            $segment = (string) substr($segment, 0, $length - 1);
        }

        // Split the element name from the attribute string, if any.
        $position_first_space = $this->nextWhiteSpace($segment);
        if ($position_first_space === false) {
            $type = $segment;
            $attributes = array();
        } else {
            $type = substr($segment, 0, $position_first_space);
            $attribute_string = trim(substr($segment, $position_first_space));
            $attributes = $this->tokenizeAttributeString($attribute_string);
        }

        if ($is_self_closing) {
            return new HTML_EmptyTag($type, $attributes);
        }
        return new HTML_StartTag($type, $attributes);
    }

    /**
     * Parses the attribute portion of a tag into a name => value map.
     *
     * Supported forms: name="value", name='value', name=value (unquoted,
     * ends at the next whitespace) and bare boolean attributes, which map
     * to their own name. An unterminated quoted value runs to the end of
     * the string. Entries with an empty name are dropped.
     *
     * @param string $string Attribute string, e.g. 'href="a" selected'.
     * @return array Map of attribute name to value.
     */
    function tokenizeAttributeString($string) {
        $string = (string) $string;
        if ($string === '') return array();

        $array = array();
        $cursor = 0;
        $size = strlen($string);
        $whitespace = " \t\r\n"; // same set as nextWhiteSpace()

        while ($cursor < $size) {
            // Skip whitespace in front of the next attribute.
            $cursor += strspn($string, $whitespace, $cursor);
            if ($cursor >= $size) {
                break;
            }

            $position_next_space = $this->nextWhiteSpace($string, $cursor);
            $position_next_equal = strpos($string, '=', $cursor);

            if ($position_next_equal !== false &&
                ($position_next_space === false ||
                 $position_next_equal < $position_next_space)) {
                // attr=... : the '=' comes before any whitespace.
                $key = trim(substr($string, $cursor, $position_next_equal - $cursor));

                // The value starts after '=' and any whitespace behind it.
                $value_start = $position_next_equal + 1;
                $value_start += strspn($string, $whitespace, $value_start);

                if ($value_start >= $size) {
                    // attr= with nothing behind it
                    $value = '';
                    $cursor = $size;
                } elseif ($string[$value_start] === '"' || $string[$value_start] === "'") {
                    $quote = $string[$value_start];
                    $position_end_quote = strpos($string, $quote, $value_start + 1);
                    if ($position_end_quote === false) {
                        // Unterminated quote: take the rest of the string.
                        // Always advancing the cursor guarantees that the
                        // loop terminates.
                        $value = (string) substr($string, $value_start + 1);
                        $cursor = $size;
                    } else {
                        $value = (string) substr($string, $value_start + 1,
                            $position_end_quote - $value_start - 1);
                        $cursor = $position_end_quote + 1;
                    }
                } else {
                    // Unquoted value: runs up to the next whitespace.
                    $position_value_end = $this->nextWhiteSpace($string, $value_start);
                    if ($position_value_end === false) {
                        $position_value_end = $size;
                    }
                    $value = substr($string, $value_start,
                        $position_value_end - $value_start);
                    $cursor = $position_value_end;
                }

                if ($key) {
                    $array[$key] = $value;
                }
            } else {
                // boolattr
                if ($position_next_space === false) {
                    $position_next_space = $size;
                }
                $key = substr($string, $cursor, $position_next_space - $cursor);
                if ($key) {
                    $array[$key] = $key;
                }
                $cursor = $position_next_space + 1;
            }
        }
        return $array;
    }
}
?>Code: Select all
<?php
/**
 * Simplified content-model table for a subset of XHTML 1.0 elements, plus a
 * pass that drops tokens violating the model and balances start/end tags.
 */
class DTD_XHTML_1_0_Simplified
{
    /**
     * Content model per element name. Each value is either the string
     * 'EMPTY' or an array whose index 0 lists the allowed child types
     * ('#PCDATA' stands for text) and whose optional index 1 lists
     * excluded child types.
     */
    var $info = array();

    /**
     * Builds the content-model table.
     */
    function __construct() {
        // Entity groups, named after the parameter entities of the DTD.
        $entity = array();
        $entity['special.extra'] = array('img');
        $entity['special.basic'] = array('br','bdo','span');
        $entity['special'] = array_merge($entity['special.basic'],
            $entity['special.extra']);
        $entity['fontstyle.extra'] = array('big','small');
        $entity['fontstyle.basic'] = array('tt','i','b','u','s','strike');
        $entity['fontstyle'] = array_merge($entity['fontstyle.extra'],
            $entity['fontstyle.basic']);
        $entity['phrase.extra'] = array('sub','sup');
        $entity['phrase.basic'] = array('em','strong','dfn','code','samp','kbd',
            'var','cite','abbr','acronym','q');
        $entity['phrase'] = array_merge($entity['phrase.extra'],
            $entity['phrase.basic']);
        $entity['misc.inline'] = array('ins','del');
        $entity['misc'] = $entity['misc.inline'];
        $entity['inline'] = array_merge(array('a'), $entity['special'],
            $entity['fontstyle'], $entity['phrase']);
        $entity['heading'] = array('h1','h2','h3','h4','h5','h6');
        $entity['lists'] = array('ul','ol', 'dl');
        $entity['blocktext'] = array('pre','hr','blockquote','address');
        $entity['block'] = array_merge(array('p','div','table'),
            $entity['heading'],$entity['lists'], $entity['blocktext']);
        // %Inline; is text plus every inline element. Building it from
        // 'special' alone would forbid e.g. <b> or <em> inside <p>.
        $entity['Inline'] = array_merge(array('#PCDATA'), $entity['inline'],
            $entity['misc.inline']);
        $entity['Flow'] = array_merge(array('#PCDATA'), $entity['block'],
            $entity['inline'], $entity['misc']);
        $entity['a.content'] = array_merge(array('#PCDATA'), $entity['special'],
            $entity['fontstyle'], $entity['phrase'], $entity['misc.inline']);
        $entity['pre.content'] = array_merge(array('#PCDATA', 'a'),
            $entity['special.basic'], $entity['fontstyle.basic'],
            $entity['phrase.basic'], $entity['misc.inline']);

        // Elements with %Flow; content. 'li' needs an entry here:
        // isTagAllowed() rejects any element that has no entry, which
        // would make every child of <ul>/<ol> illegal.
        $flow_elements = array('ins','del','blockquote','dd','div','li');
        foreach ($flow_elements as $name) {
            $this->info[$name] = array($entity['Flow']);
        }

        // Elements with %Inline; content.
        $inline_elements = array(
            'em','strong','dfn','code','samp','kbd','var','cite','abbr',
            'acronym','q','sub','tt','sup','i','b','big','small','u','s',
            'strike','bdo','span','dt','p','h1','h2','h3','h4','h5','h6',
        );
        foreach ($inline_elements as $name) {
            $this->info[$name] = array($entity['Inline']);
        }

        $this->info['ol'] =
        $this->info['ul'] = array(array('li'),array(),'+');
        $this->info['dl'] = array(array('dt','dd'));
        $this->info['address'] = array(array_merge(array('#PCDATA', 'p'),
            $entity['inline'], $entity['misc.inline']));
        $this->info['img'] =
        $this->info['br'] =
        $this->info['hr'] = 'EMPTY';
        $this->info['pre'] = array($entity['pre.content']);
        $this->info['a'] = array($entity['a.content']);
    }

    /**
     * PHP 4-style constructor, kept so that explicit calls to it and old
     * interpreters keep working. Newer PHP versions no longer treat a
     * method named after the class as the constructor.
     */
    function DTD_XHTML_1_0_Simplified() {
        $this->__construct();
    }

    /**
     * Filters a token list so that it obeys the content model and every
     * start tag has a matching end tag.
     *
     * - Tag names are lower-cased (on the token objects themselves).
     * - Text and start/empty tags not allowed in the innermost open
     *   element (or in an implicit <div> at top level) are dropped.
     * - End tags that do not match the innermost open element are dropped.
     * - A start tag for an element declared EMPTY is emitted as an
     *   HTML_EmptyTag and opens no scope.
     * - Elements still open at the end are closed, innermost first.
     * - Comments and any other token kinds pass through untouched.
     *
     * @param array $tokens Token objects as produced by the lexer.
     * @return array Filtered, balanced token list.
     */
    function makeTokensWellFormed($tokens) {
        $opened_tags = array(); // stack of currently open start tags
        $array = array();

        foreach ($tokens as $token) {
            $is_text = is_a($token, 'HTML_Text');
            if (!$is_text && !is_a($token, 'HTML_Comment')) {
                $token->selfToLower();
            }

            // HTML_EmptyTag is a subclass of HTML_StartTag, so the
            // HTML_StartTag check covers both.
            if ($is_text || is_a($token, 'HTML_StartTag')) {
                $depth = count($opened_tags);
                $parent = ($depth > 0) ? $opened_tags[$depth - 1] : '#ROOT';
                if (!$this->isTagAllowed($token, $parent)) {
                    continue;
                }
                if (!$is_text && !is_a($token, 'HTML_EmptyTag')) {
                    // isTagAllowed() guarantees an info entry exists here.
                    if ($this->info[$token->getType()] === 'EMPTY') {
                        // e.g. "<br>": cannot have content, so it must
                        // not open a scope that swallows what follows.
                        $token = new HTML_EmptyTag($token->getType(),
                            $token->attributes);
                    } else {
                        $opened_tags[] = $token;
                    }
                }
            } elseif (is_a($token, 'HTML_EndTag')) {
                $depth = count($opened_tags);
                if ($depth == 0 ||
                    $opened_tags[$depth - 1]->getType() != $token->getType()) {
                    continue;
                }
                array_pop($opened_tags);
            }

            $array[] = $token;
        }

        // Close whatever is still open, innermost element first.
        while (!empty($opened_tags)) {
            $open_tag = array_pop($opened_tags);
            $array[] = new HTML_EndTag($open_tag->getType());
        }
        return $array;
    }

    /**
     * Tells whether $tag may appear directly inside $parent.
     *
     * @param object             $tag    Token providing getType().
     * @param object|string|null $parent Enclosing start tag, or '#ROOT' /
     *                                   null for top level, which is
     *                                   treated as a <div>.
     * @return bool
     */
    function isTagAllowed($tag, $parent) {
        if ($parent === '#ROOT' || $parent === null) {
            $parent = new HTML_StartTag('div');
        }
        $parent_type = $parent->getType();
        $type = $tag->getType();

        // check if is real tag
        if ($type !== '#PCDATA' && !isset($this->info[$type])) return false;

        // Unknown parents and EMPTY parents (whose entry is a string, not
        // a list) cannot contain anything.
        if (!isset($this->info[$parent_type]) ||
            !is_array($this->info[$parent_type])) {
            return false;
        }

        $parent_info = $this->info[$parent_type];
        if (!in_array($type, $parent_info[0], true)) {
            return false;
        }
        return empty($parent_info[1]) || !in_array($type, $parent_info[1], true);
    }
}
?>Code: Select all
<?php
/**
 * SimpleTest cases for MarkupLexer.
 */
class SimpleTest_MarkupLexer extends UnitTestCase
{
    var $MarkupLexer;

    /**
     * Creates a fresh lexer for every test method.
     */
    function setUp() {
        // Plain assignment: "=& new" is a parse error on PHP 7 and later.
        $this->MarkupLexer = new MarkupLexer();
    }

    function test_convertShorthand() {
        // Pairs of (input, expected output).
        $test_array = array(
            array('The dog--the cat.', 'The dog–the cat.'),
            array('Perhaps… it was fate!', 'Perhaps... it was fate!'),
            array('He said, “What?”', 'He said, "What?"'),
            array('‘Camelot’', '\'Camelot\''),
        );
        foreach ($test_array as $input_expect) {
            $input = $input_expect[0];
            $expect = $input_expect[1];
            $result = $this->MarkupLexer->convertShorthand($input);
            $this->assertEqual($expect, $result);
        }
    }

    function test_nextWhiteSpace() {
        $HP =& $this->MarkupLexer;
        $this->assertIdentical(false, $HP->nextWhiteSpace('asdf'));
        $this->assertIdentical(0, $HP->nextWhiteSpace(' asdf'));
        $this->assertIdentical(0, $HP->nextWhiteSpace("\nasdf"));
        $this->assertIdentical(1, $HP->nextWhiteSpace("a\tsdf"));
        $this->assertIdentical(4, $HP->nextWhiteSpace("asdf\r"));
        $this->assertIdentical(2, $HP->nextWhiteSpace("as\t\r\nasdf as"));
    }

    function test_tokenizeHTML() {
        $input = array();
        $expect = array();

        // empty input
        $input[] = '';
        $expect[] = array();

        // text only
        $input[] = 'This is regular text.';
        $expect[] = array(
            new HTML_Text('This is regular text.'),
        );

        // text around a simple element
        $input[] = 'This is <b>bold</b> text';
        $expect[] = array(
            new HTML_Text('This is '),
            new HTML_StartTag('b', array()),
            new HTML_Text('bold'),
            new HTML_EndTag('b'),
            new HTML_Text(' text'),
        );

        // the lexer keeps tag-name case as written
        $input[] = '<DIV>Totally rad dude. <b>asdf</b></div>';
        $expect[] = array(
            new HTML_StartTag('DIV', array()),
            new HTML_Text('Totally rad dude. '),
            new HTML_StartTag('b', array()),
            new HTML_Text('asdf'),
            new HTML_EndTag('b'),
            new HTML_EndTag('div'),
        );

        // unknown and unbalanced tags are tokenized as-is
        $input[] = '<asdf></asdf><d></d><poOloka><poolasdf><ds></asdf></ASDF>';
        $expect[] = array(
            new HTML_StartTag('asdf'),
            new HTML_EndTag('asdf'),
            new HTML_StartTag('d'),
            new HTML_EndTag('d'),
            new HTML_StartTag('poOloka'),
            new HTML_StartTag('poolasdf'),
            new HTML_StartTag('ds'),
            new HTML_EndTag('asdf'),
            new HTML_EndTag('ASDF'),
        );

        // attributes separated by tab and newline
        $input[] = '<a'."\t".'href="foobar.php"'."\n".'title="foo!">Link to <b id="asdf">foobar</b></a>';
        $expect[] = array(
            new HTML_StartTag('a',array('href'=>'foobar.php','title'=>'foo!')),
            new HTML_Text('Link to '),
            new HTML_StartTag('b',array('id'=>'asdf')),
            new HTML_Text('foobar'),
            new HTML_EndTag('b'),
            new HTML_EndTag('a'),
        );

        // self-closing tag
        $input[] = '<br />';
        $expect[] = array(
            new HTML_EmptyTag('br'),
        );

        // comments, including one with an extra trailing dash
        $input[] = '<!-- Comment --> <!-- not so well formed --->';
        $expect[] = array(
            new HTML_Comment(' Comment '),
            new HTML_Text(' '),
            new HTML_Comment(' not so well formed -'),
        );

        // tag that is never closed comes back as text
        $input[] = '<a href=""';
        $expect[] = array(
            new HTML_Text('<a href=""'),
        );

        $size = count($input);
        for ($i = 0; $i < $size; $i++) {
            $result = $this->MarkupLexer->tokenizeHTML($input[$i]);
            $this->assertEqual($expect[$i], $result);
            paintIf($result, $expect[$i] != $result);
        }
    }

    function test_tokenizeAttributeString() {
        $input = array();
        $expect = array();

        $input[] = 'href="asdf" boom="assdf"';
        $expect[] = array('href'=>'asdf', 'boom'=>'assdf');

        $input[] = "href='r'";
        $expect[] = array('href'=>'r');

        $input[] = 'onclick="javascript:alert(\'asdf\');"';
        $expect[] = array('onclick' => "javascript:alert('asdf');");

        $input[] = 'selected';
        $expect[] = array('selected'=>'selected');

        // value without a name is discarded
        $input[] = '="asdf"';
        $expect[] = array();

        $size = count($input);
        for ($i = 0; $i < $size; $i++) {
            $result = $this->MarkupLexer->tokenizeAttributeString($input[$i]);
            $this->assertEqual($expect[$i], $result);
            paintIf($result, $expect[$i] != $result);
        }
    }
}
?>Code: Select all
<?php
/**
 * SimpleTest cases for DTD_XHTML_1_0_Simplified.
 */
class SimpleTest_DTD_XHTML_1_0_Simplified extends UnitTestCase
{
    function test_makeTokensWellFormed() {
        // Plain assignment: "=& new" is a parse error on PHP 7 and later.
        $DTD = new DTD_XHTML_1_0_Simplified();
        $input = array();
        $expect = array();

        // unclosed start tag gets closed
        $input[] = array(
            new HTML_StartTag('b'),
            new HTML_Text('Bold text'),
        );
        $expect[] = array(
            new HTML_StartTag('b'),
            new HTML_Text('Bold text'),
            new HTML_EndTag('b'),
        );

        // stray end tag is removed
        $input[] = array(
            new HTML_Text('Bold text'),
            new HTML_EndTag('b'),
            new HTML_Text('asdf'),
        );
        $expect[] = array(
            new HTML_Text('Bold text'),
            new HTML_Text('asdf'),
        );

        // block element inside an inline element is removed
        $input[] = array(
            new HTML_StartTag('b'),
            new HTML_StartTag('div'),
            new HTML_EndTag('div'),
            new HTML_EndTag('b'),
        );
        $expect[] = array(
            new HTML_StartTag('b'),
            new HTML_EndTag('b'),
        );

        // Smoke run on lexer output; the result is not asserted on.
        $MarkupLexer = new MarkupLexer();
        $tokens = $MarkupLexer->tokenizeHTML(
'<html><body><h1>A title</h1><p>Let us begin</p>
<p>asdf<p>asdf</p></p><span><div>asdf</div></span></body></html>'
        );
        $tokens = $DTD->makeTokensWellFormed($tokens);

        $size = count($input);
        for ($i = 0; $i < $size; $i++) {
            $result = $DTD->makeTokensWellFormed($input[$i]);
            $this->assertEqual($expect[$i],$result);
            paintIf($result, $expect[$i] != $result);
        }
    }
}
?>What do you think?
Edit
Forgot a few more classes...
Code: Select all
<?php
/**
 * Token for an HTML comment; holds the text between "<!--" and "-->".
 */
class HTML_Comment
{
    /** Comment text without the delimiters. */
    var $value;

    /**
     * @param string $value Comment text.
     */
    function __construct($value) {
        $this->value = $value;
    }

    /**
     * PHP 4-style constructor, kept for old interpreters and explicit
     * calls. Newer PHP versions no longer treat a method named after the
     * class as the constructor, which would leave $value unset.
     */
    function HTML_Comment($value) {
        $this->__construct($value);
    }
}
?>Code: Select all
<?php
/**
 * Base class for tag tokens; stores the element name.
 */
class HTML_Tag
{
    /** Element name as written in the source, until selfToLower() is called. */
    var $type;

    /**
     * @param string $type Element name.
     */
    function __construct($type) {
        $this->type = $type;
    }

    /**
     * PHP 4-style constructor, kept for old interpreters and for code
     * that calls it explicitly. Newer PHP versions no longer treat a
     * method named after the class as the constructor.
     */
    function HTML_Tag($type) {
        $this->__construct($type);
    }

    /**
     * Lower-cases the element name in place.
     */
    function selfToLower() {
        $this->type = strtolower($this->type);
    }

    /**
     * @return string Element name.
     */
    function getType() {
        return $this->type;
    }
}

/**
 * Opening tag with its attributes.
 */
class HTML_StartTag extends HTML_Tag
{
    /** Map of attribute name to value. */
    var $attributes = array();

    /**
     * @param string $type       Element name.
     * @param array  $attributes Map of attribute name to value.
     */
    function __construct($type, $attributes = array()) {
        parent::__construct($type);
        $this->attributes = $attributes;
    }

    /**
     * PHP 4-style constructor, see HTML_Tag::HTML_Tag().
     */
    function HTML_StartTag($type, $attributes = array()) {
        $this->__construct($type, $attributes);
    }
}

/**
 * Self-closing tag such as <br />; same data as a start tag.
 */
class HTML_EmptyTag extends HTML_StartTag
{
}

/**
 * Closing tag; carries only the element name.
 */
class HTML_EndTag extends HTML_Tag
{
}
?>Code: Select all
<?php
/**
 * Token for a run of character data between tags.
 */
class HTML_Text
{
    /** The text itself. */
    var $value;

    /**
     * @param string $value Text content.
     */
    function __construct($value) {
        $this->value = $value;
    }

    /**
     * PHP 4-style constructor, kept for old interpreters and explicit
     * calls. Newer PHP versions no longer treat a method named after the
     * class as the constructor, which would leave $value unset.
     */
    function HTML_Text($value) {
        $this->__construct($value);
    }

    /**
     * @return string Always '#PCDATA', the name used for text in the
     *                content-model lists.
     */
    function getType() {return '#PCDATA';}

    /**
     * @return string The text content.
     */
    function getValue() {return $this->value;}

    /**
     * Appends $string to the text content.
     */
    function addToValue($string) {$this->value .= $string;}
}
?>