Page 2 of 2

Posted: Sat Aug 20, 2005 2:54 am
by AnarKy
AGISB wrote:Also make sure you remember which of the tags were opened, and close them if they are not closed.
What is the best way to do this? I can imagine that this could get quite messy? :?
Is there an easy way to determine where to close the tag?

Posted: Sat Aug 20, 2005 3:05 am
by feyd
basically, it'd require analyzing the object stack, which isn't all that hard.

Posted: Sun Aug 21, 2005 1:59 pm
by Ambush Commander
You'll end up building a complete parser and implementing the DTD. No, you can't use XML parsing, because people generally don't follow the XML rules of well-formedness; on top of that, HTML has several caveats that cannot be handled by a regular parser.

My suggestion is to simply handle inline tags: they're fairly simple and are unlikely to break the layout. If you want to expand support to almost all tags... well... I started something... here's a sample:

Code: Select all

//////////////
        // Tag grouping
        //
        // Plain lists of element names, grouped so that later tables can refer
        // to a whole family at once. The group names look like the entity
        // groups of the HTML 4 DTD (heading, list, fontstyle, ...) -- TODO
        // confirm against the DTD this table is meant to implement.

        // Leaf groups.
        $heading      = array('h1', 'h2', 'h3', 'h4', 'h5', 'h6');
        $list         = array('ul', 'ol');
        $preformatted = array('pre');
        $fontstyle    = array('tt', 'i', 'b', 'big', 'small');
        $phrase       = array(
            'em', 'strong', 'dfn', 'code', 'samp',
            'kbd', 'var', 'cite', 'abbr', 'acronym'
        );
        $special      = array(
            'a', 'img', 'br', 'q', 'sub', 'sup',
            'span', 'bdo', 'object', 'script', 'map'
        );
        $formctrl     = array('input', 'select', 'textarea', 'label', 'button');

        // Composite groups. array_merge() re-indexes, so each result is a
        // flat list in exactly the order the arguments are given.
        $inline = array_merge(
            array('PCDATA'),
            $fontstyle,
            $phrase,
            $special,
            $formctrl
        );
        $block = array_merge(
            array(
                'p', 'dl', 'div', 'blockquote', 'hr', 'table', 'address',
                'noscript', 'form', 'fieldset'
            ),
            $heading,
            $list,
            $preformatted
        );
        $flow = array_merge($block, $inline);

        // Elements singled out for exclusion; judging by the name this is the
        // set not allowed inside <pre> -- TODO confirm where it is consumed.
        $preExclusion = array('img', 'object', 'big', 'small', 'sub', 'sup');
        
        /////////////////////////
        // Container type grouping
        //
        // Each variable names one attribute value type. Most hold a single
        // 'KIND?Name' marker string; $headMisc and $Shape instead enumerate
        // literal names, and $HTMLFrameset holds the marker 'IGNORE'.
        // NOTE(review): how the 'KIND?Name' markers are interpreted is not
        // visible in this snippet -- confirm against the code that reads them.
        $e_ListClass = array('CDATA?ListClass');
        $e_ListURI = array('CDATA?ListURI');
        $e_NamedLinkEnd = array('CDATA?NamedLinkEnd');
        $e_ReferenceUsemap = array('CDATA?ReferenceUsemap');
        $e_PropertyValue = array('CDATA?PropertyValue');
        $HTMLFrameset = array('IGNORE');
        $ContentType = array('CDATA?ContentType');
        $ContentTypes = array('CDATA?ContentTypes');
        $Charset = array('CDATA?Charset');
        // Fixed: this variable was misspelled "$Chrasets", so any reference to
        // the correctly spelled $Charsets would have been undefined.
        $Charsets = array('CDATA?Charsets');
        // Deprecated alias kept so code written against the old misspelling
        // keeps working; use $Charsets in new code.
        $Chrasets = $Charsets;
        $LanguageCode = array('NAME?LanguageCode');
        $Character = array('CDATA?Character');
        $LinkTypes = array('CDATA?LinkTypes');
        $MediaDesc = array('CDATA?MediaDesc');
        $URI = array('CDATA?URI');
        $Datetime = array('CDATA?Datetime');
        $Script = array('CDATA?Script');
        $StyleSheet = array('CDATA?StyleSheet');
        $Text = array('CDATA?Text');
        $headMisc = array('script','style','meta','link','object');
        $Shape = array('rect','circle','poly','default');
        $Coords = array('CDATA?Coords');
        $Length = array('CDATA?Length');
        $MultiLength = array('CDATA?MultiLength');
        $MultiLengths = array('CDATA?MultiLengths');
        $Pixels = array('CDATA?Pixels');
        
        ////////////
        // Attributes
        //
        // Attribute tables shared by many elements. Every entry maps an
        // attribute name to a descriptor with two keys:
        //   'type'    => list describing the allowed value(s)
        //   'default' => TWP_DTD_ATTRIBUTE_IMPLIED for every entry here

        // Core attributes.
        $coreattrs = array(
            'id'    => array(
                'type'    => array('ID'),
                'default' => TWP_DTD_ATTRIBUTE_IMPLIED
            ),
            'class' => array(
                'type'    => $e_ListClass,
                'default' => TWP_DTD_ATTRIBUTE_IMPLIED
            ),
            'style' => array(
                'type'    => $StyleSheet,
                'default' => TWP_DTD_ATTRIBUTE_IMPLIED
            ),
            'title' => array(
                'type'    => $Text,
                'default' => TWP_DTD_ATTRIBUTE_IMPLIED
            )
        );

        // Internationalisation attributes.
        $i18n = array(
            'lang' => array(
                'type'    => $LanguageCode,
                'default' => TWP_DTD_ATTRIBUTE_IMPLIED
            ),
            'dir'  => array(
                'type'    => array('ltr', 'rtl'),
                'default' => TWP_DTD_ATTRIBUTE_IMPLIED
            )
        );

        // Event handler attributes: all ten share one descriptor, so the
        // table is built by pairing the ordered name list with that value.
        $events = array_fill_keys(
            array(
                'onclick', 'ondblclick',
                'onmousedown', 'onmouseup', 'onmouseover', 'onmousemove', 'onmouseout',
                'onkeypress', 'onkeydown', 'onkeyup'
            ),
            array('type' => $Script, 'default' => TWP_DTD_ATTRIBUTE_IMPLIED)
        );

        // Initialised empty here; nothing in this snippet fills it.
        $reserved = array();

        // Union of the three tables above, in the order core, i18n, events.
        $attrs = array_merge($coreattrs, $i18n, $events);
        
        
        
        /////////////////////////
        // User defined attributes
        //
        // One single-entry table per attribute: attribute name => descriptor
        // with 'type' (one of the value-type lists defined earlier) and
        // 'default' (TWP_DTD_ATTRIBUTE_IMPLIED for every entry here).
        $a_accesskey= array('accesskey'=>array('type'=>$Character,'default'=>TWP_DTD_ATTRIBUTE_IMPLIED));
        $a_archive  = array('archive'=>array('type'=>$e_ListURI,'default'=>TWP_DTD_ATTRIBUTE_IMPLIED));
        $a_charset  = array('charset'=>array('type'=>$Charset,'default'=>TWP_DTD_ATTRIBUTE_IMPLIED));
        $a_coords   = array('coords'=>array('type'=>$Coords,'default'=>TWP_DTD_ATTRIBUTE_IMPLIED));
        $a_classid  = array('classid'=>array('type'=>$URI,'default'=>TWP_DTD_ATTRIBUTE_IMPLIED));
        $a_codebase = array('codebase'=>array('type'=>$URI,'default'=>TWP_DTD_ATTRIBUTE_IMPLIED));
        $a_codetype = array('codetype'=>array('type'=>$ContentType,'default'=>TWP_DTD_ATTRIBUTE_IMPLIED));
        $a_cite     = array('cite'=>array('type'=>$URI,'default'=>TWP_DTD_ATTRIBUTE_IMPLIED));
        $a_data     = array('data'=>array('type'=>$URI,'default'=>TWP_DTD_ATTRIBUTE_IMPLIED));
        // Fixed: the key was misspelled 'datatime'. The HTML attribute is
        // 'datetime', so a real datetime="..." attribute never matched it.
        $a_datetime = array('datetime'=>array('type'=>$Datetime,'default'=>TWP_DTD_ATTRIBUTE_IMPLIED));
Crazy, isn't it...