You'll end up building a complete parser and implementing the DTD. No, you can't use an XML parser, because people generally don't follow XML's well-formedness rules, and HTML has several caveats that a regular XML parser cannot handle.
My suggestion is to simply handle inline tags: they're fairly simple and are unlikely to break the layout. If you want to expand support to almost all tags... well... I started something... here's a sample:
Code: Select all
//////////////
//Tag Grouping
// Lists of element names, grouped by category. The group names appear to
// follow the entity names of the HTML 4.01 Strict DTD -- confirm against the DTD.
$heading      = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6'];
$list         = ['ul', 'ol'];
$preformatted = ['pre'];
$fontstyle    = ['tt', 'i', 'b', 'big', 'small'];
$phrase       = ['em', 'strong', 'dfn', 'code', 'samp', 'kbd', 'var', 'cite', 'abbr', 'acronym'];
$special      = ['a', 'img', 'br', 'q', 'sub', 'sup', 'span', 'bdo', 'object', 'script', 'map'];
$formctrl     = ['input', 'select', 'textarea', 'label', 'button'];

// Inline content: the 'PCDATA' marker first, then every inline-level group in order.
$inline = array_merge(['PCDATA'], $fontstyle, $phrase, $special, $formctrl);

// Block content: the individually listed elements first, then the grouped ones.
$block = array_merge(
    ['p', 'dl', 'div', 'blockquote', 'hr', 'table', 'address', 'noscript', 'form', 'fieldset'],
    $heading,
    $list,
    $preformatted
);

// Flow content is everything: block-level names followed by inline-level names.
$flow = array_merge($block, $inline);

// Element names listed for exclusion in connection with 'pre' (going by the variable name).
$preExclusion = ['img', 'object', 'big', 'small', 'sub', 'sup'];
/////////////////////////
//Container Type Grouping
// Attribute value types. Most are a single-element array holding a string of
// the form '<base>?<name>' (e.g. 'CDATA?URI'); presumably the part before '?'
// is the DTD base type and the part after it names the specific type --
// confirm against the code that consumes these arrays.
$e_ListClass = array('CDATA?ListClass');
$e_ListURI = array('CDATA?ListURI');
$e_NamedLinkEnd = array('CDATA?NamedLinkEnd');
$e_ReferenceUsemap = array('CDATA?ReferenceUsemap');
$e_PropertyValue = array('CDATA?PropertyValue');
$HTMLFrameset = array('IGNORE');
$ContentType = array('CDATA?ContentType');
$ContentTypes = array('CDATA?ContentTypes');
$Charset = array('CDATA?Charset');
$Charsets = array('CDATA?Charsets');
// Backward-compatible alias: this variable was originally declared only under
// the misspelled name, so keep it for any code that still refers to it.
$Chrasets = $Charsets;
$LanguageCode = array('NAME?LanguageCode');
$Character = array('CDATA?Character');
$LinkTypes = array('CDATA?LinkTypes');
$MediaDesc = array('CDATA?MediaDesc');
$URI = array('CDATA?URI');
$Datetime = array('CDATA?Datetime');
$Script = array('CDATA?Script');
$StyleSheet = array('CDATA?StyleSheet');
$Text = array('CDATA?Text');
// Unlike the typed entries around them, these two are plain lists of names.
$headMisc = array('script','style','meta','link','object');
$Shape = array('rect','circle','poly','default');
$Coords = array('CDATA?Coords');
$Length = array('CDATA?Length');
$MultiLength = array('CDATA?MultiLength');
$MultiLengths = array('CDATA?MultiLengths');
$Pixels = array('CDATA?Pixels');
////////////
//Attributes
// Each attribute name maps to a descriptor array with two entries:
//   'type'    => the value type (one of the type arrays, or a list of literals)
//   'default' => the default marker; every entry here uses TWP_DTD_ATTRIBUTE_IMPLIED
$coreattrs = [
    'id'    => ['type' => ['ID'],       'default' => TWP_DTD_ATTRIBUTE_IMPLIED],
    'class' => ['type' => $e_ListClass, 'default' => TWP_DTD_ATTRIBUTE_IMPLIED],
    'style' => ['type' => $StyleSheet,  'default' => TWP_DTD_ATTRIBUTE_IMPLIED],
    'title' => ['type' => $Text,        'default' => TWP_DTD_ATTRIBUTE_IMPLIED],
];

$i18n = [
    'lang' => ['type' => $LanguageCode,  'default' => TWP_DTD_ATTRIBUTE_IMPLIED],
    'dir'  => ['type' => ['ltr', 'rtl'], 'default' => TWP_DTD_ATTRIBUTE_IMPLIED],
];

// All ten event attributes share one descriptor, so build the map from the
// list of names instead of spelling out the same descriptor ten times.
$events = array_fill_keys(
    [
        'onclick',
        'ondblclick',
        'onmousedown',
        'onmouseup',
        'onmouseover',
        'onmousemove',
        'onmouseout',
        'onkeypress',
        'onkeydown',
        'onkeyup',
    ],
    ['type' => $Script, 'default' => TWP_DTD_ATTRIBUTE_IMPLIED]
);

// Declared empty here.
$reserved = [];

// The combined attribute set: core, then i18n, then events.
$attrs = array_merge($coreattrs, $i18n, $events);
/////////////////////////
//User Defined Attributes
// One single-entry map per attribute, named $a_<attribute>. The key is the
// attribute name as it appears in markup; the value is a descriptor holding
// the value type ('type') and the default marker ('default').
$a_accesskey = array('accesskey'=>array('type'=>$Character,'default'=>TWP_DTD_ATTRIBUTE_IMPLIED));
$a_archive = array('archive'=>array('type'=>$e_ListURI,'default'=>TWP_DTD_ATTRIBUTE_IMPLIED));
$a_charset = array('charset'=>array('type'=>$Charset,'default'=>TWP_DTD_ATTRIBUTE_IMPLIED));
$a_coords = array('coords'=>array('type'=>$Coords,'default'=>TWP_DTD_ATTRIBUTE_IMPLIED));
$a_classid = array('classid'=>array('type'=>$URI,'default'=>TWP_DTD_ATTRIBUTE_IMPLIED));
$a_codebase = array('codebase'=>array('type'=>$URI,'default'=>TWP_DTD_ATTRIBUTE_IMPLIED));
$a_codetype = array('codetype'=>array('type'=>$ContentType,'default'=>TWP_DTD_ATTRIBUTE_IMPLIED));
$a_cite = array('cite'=>array('type'=>$URI,'default'=>TWP_DTD_ATTRIBUTE_IMPLIED));
$a_data = array('data'=>array('type'=>$URI,'default'=>TWP_DTD_ATTRIBUTE_IMPLIED));
// The key must be the real attribute name 'datetime'; it was previously
// misspelled 'datatime', which no attribute in a document would ever match.
$a_datetime = array('datetime'=>array('type'=>$Datetime,'default'=>TWP_DTD_ATTRIBUTE_IMPLIED));
Crazy, isn't it...