Page 1 of 1

Find Closing tag in nested html tags

Posted: Sat Mar 11, 2006 3:35 am
by Mehr
<div class="main">
<div class="d1a">
d1a text
</div>
<div class="d1b">
d1b text
</div>
</div>

<div class="main">
<div class="d2a">
d2a text
</div>
<div class="d2b">
d2b text
</div>
</div>

Hello,
I want to write a regex that matches a specific tag together with its nested tags. In the above example, you can see there are two "main" class divs, each of which has some nested divs. I need to find the closing tag of each "main" class div. I tried the following patterns but they didn't work:
/(<div[^>]*class\=(\"|\')Sense(\"|\')[^>]*>(<div[^>]*>.*?<\/div>)*.*?<\/div>)/is

/(<div[^>]*class\=(\"|\')Sense(\"|\')[^>]*>.*?(<div[^>]*>[^(<div)]*<\/div>.*?)*.*?<\/div>)/is

<div.*?>.*?<\/div>

/(<div[^>]*class\=(\"|\')maindiv(\"|\')[^>]*>.*?(<div.*?(\4|<\/div>)*.*?)*<\/div>)/is

/(<div[^>]*>)?(?(1).*?<\/div>)*/is

/(<div[^>]*>)?(?(1).*?<\/div>|\1)*/is

!((<div[^>]*>|</div>)*.*?(<div[^>]*>|</div>)*)!is

Every solution will be appreciated.
Thanks a lot.

Posted: Sat Mar 11, 2006 4:28 am
by Chris Corbyn
You need to use a tokenizer to do that. Perhaps looking over the BBCode parsing functions in the phpBB source would be a help ;)

Here's a simple BBCode parser I wrote though.... sorry, you'll need to suss it out from this but basically, look at the method called "tokenize()" and then look at any of the other methods such as parseQuotes().

Code: Select all

<?php

/*
 A custom BBCode handler with support
 for plugins for new BBCode tags.
 Chris Corbyn (Jan 2006)
 */

/**
 * Converts BBCode markup into HTML.
 *
 * The text is split into a flat list of tokens (opening tag, closing tag,
 * plain text) by tokenize(); each parseXxx() method then walks that list and
 * swaps the tag tokens for HTML. A tag type is only rendered when its opening
 * and closing tags are balanced, otherwise the text is returned untouched.
 *
 * NOTE(review): apart from the href built in parseUrl(), user-supplied text is
 * copied into the HTML output as-is. Callers are presumably expected to
 * HTML-escape the input before handing it to this class -- confirm.
 */
class BBCode
{
	private
		$source,
		$output,
		$tokens = array(),
		$tags = array(
			'b' => '\[b\]',
			'u' => '\[u\]',
			'i' => '\[i\]',
			'size' => '\[size=([^\]]+)\]',
			'color' => '\[color=([^\]]+)\]',
			'img' => '\[img\]',
			'url' => '\[url=([^\]]+)\]',
			'quote' => '\[quote(?:="([^"]+)")?\]',
			'code' => '\[code(?:=([^\]]+))?\]',
			'list' => '\[list(?:=([^\]]+))?\]',
			'smilies' => ':dense:|:drunk:|:\)|:-\)|:smile:|;\)|;-\)|:wink:|:p|:P|:-p|:-P|:\(|:-\(|:sad:|:razz:|:\?|:confused:|:\||:-\||:blank:|:nod:|:shake:|:reading:|:cool:|:D|:-D|:grin:|:hit:|:smart:|:laugh:|:lol:|:thumbsup:|:thumbsdown:|:cry:|:wavecry:|:arrow:|:angry:|:woot:|:love:|:rolleyes:|:unsure:|:angel:|:clap:|:buzz:|:shocked:|:food:|:sleep:|:smirk:|:wiz:|:wizard:'
		);
		private $smiliePath = 'sys/img/smilies';

	//Smilie code => array(image file name, alt text)
	private static $smilieImages = array(
		':)' => array('smile1.gif', 'smile'),
		':-)' => array('smile1.gif', 'smile'),
		':smile:' => array('smile1.gif', 'smile'),
		';)' => array('wink.gif', 'wink'),
		';-)' => array('wink.gif', 'wink'),
		':wink:' => array('wink.gif', 'wink'),
		':p' => array('tongue.gif', 'razz'),
		':P' => array('tongue.gif', 'razz'),
		':-p' => array('tongue.gif', 'razz'),
		':-P' => array('tongue.gif', 'razz'),
		':razz:' => array('tongue.gif', 'razz'),
		':(' => array('sad.gif', 'sad'),
		':-(' => array('sad.gif', 'sad'),
		':sad:' => array('sad.gif', 'sad'),
		':nod:' => array('yes.gif', 'yes'),
		':shake:' => array('no.gif', 'no'),
		':reading:' => array('coffee.gif', 'coffee'),
		':cool:' => array('cool2.gif', 'cool'),
		':D' => array('grin.gif', 'grin'),
		':-D' => array('grin.gif', 'grin'),
		':grin:' => array('grin.gif', 'grin'),
		':laugh:' => array('laugh.gif', 'laugh'),
		':lol:' => array('laugh.gif', 'laugh'),
		':hit:' => array('hit.gif', 'wollop'),
		':thumbsup:' => array('thumbsup.gif', 'thumbs up'),
		':thumbsdown:' => array('thumbsdown.gif', 'thumbs down'),
		':dense:' => array('dense.gif', 'dense'),
		':smart:' => array('smartass.gif', 'smart'),
		':?' => array('huh.gif', 'confused'),
		':confused:' => array('huh.gif', 'confused'),
		':arrow:' => array('arrow.gif', 'arrow'),
		':cry:' => array('cry.gif', 'cry'),
		':wavecry:' => array('wavecry.gif', 'waving crying'),
		':angry:' => array('wag.gif', 'angry'),
		':love:' => array('wub.gif', 'lovey dovey'),
		':woot:' => array('w00t.gif', 'woot!'),
		':rolleyes:' => array('rolleyes.gif', 'rolling eyes'),
		':unsure:' => array('hmmm.gif', 'hmmm'),
		':angel:' => array('angel.gif', 'angel'),
		':clap:' => array('clap2.gif', 'clapping'),
		':drunk:' => array('drunk.gif', 'drunk'),
		':buzz:' => array('mml.gif', 'buzzing'),
		':|' => array('noexpression.gif', 'blank'),
		':-|' => array('noexpression.gif', 'blank'),
		':blank:' => array('noexpression.gif', 'blank'),
		':shocked:' => array('ohmy.gif', 'shocked'),
		':food:' => array('pizza.gif', 'pizza'),
		':sleep:' => array('sleeping.gif', 'sleeping'),
		':smirk:' => array('smirk.gif', 'smirk'),
		':wiz:' => array('wizard.gif', 'wizard'),
		':wizard:' => array('wizard.gif', 'wizard')
	);

	/**
	 * @param string $input BBCode text used whenever a parse method is called without a source
	 */
	public function __construct($input)
	{
		$this->setSource($input);
	}

	/**
	 * Registers (or overrides) the opening-tag pattern for a tag.
	 *
	 * @param string $tag  Tag name; the closing tag is always "[/<name>]"
	 * @param string $eval PCRE fragment (no delimiters) matching the opening tag
	 */
	public function setBBTag($tag, $eval)
	{
		$this->tags[$tag] = $eval;
	}

	protected function setSource($source)
	{
		$this->source = $source;
	}

	/**
	 * Splits text into a flat list of tokens for one tag type.
	 *
	 * Every opening tag, every closing tag and every run of text between them
	 * becomes one entry. A complete "[x]...[/x]" block of any tag listed in
	 * $ignore is kept as a single entry so that tags inside it are not seen.
	 *
	 * The $stack and $recursing parameters are retained from the earlier
	 * recursive implementation for backward compatibility: $stack seeds the
	 * result, and $recursing suppresses the fall-back to the stored source when
	 * $text is empty.
	 *
	 * @param string       $tag       Key in the tag table
	 * @param string|false $text      Text to split; '' / false / null means "use the stored source"
	 * @param array        $ignore    Tag names whose whole blocks are treated as opaque
	 * @param array        $stack     Tokens to prepend to the result
	 * @param bool         $recursing When true, an empty $text yields $stack unchanged
	 * @return array List of token strings in document order
	 */
	protected function tokenize($tag, $text=false, $ignore=array(), $stack=array(), $recursing=false)
	{
		//Compare explicitly: a text of "0" is real content, not "no text"
		if (!$recursing && ($text === false || $text === null || $text === '')) $text = $this->source;
		$text = (string) $text;
		if ($text === '')
		{
			if (!$recursing) $stack[] = $text; //An empty source tokenizes to one empty token
			return $stack;
		}

		$block = '';
		foreach ($ignore as $t) $block .= $this->tags[$t].'.*?'.$this->closingPattern($t).'|';

		$re = '@'.$block.$this->tags[$tag].'|'.$this->closingPattern($tag).'@is'; //Ignored block, opening tag or closing tag

		//Iterative rather than recursive so heavily tagged text cannot exhaust the call stack
		while ($text !== '')
		{
			//A failed or zero-length match means nothing more can be consumed
			if (!preg_match($re, $text, $matches, PREG_OFFSET_CAPTURE) || $matches[0][0] === '')
			{
				$stack[] = $text;
				break;
			}
			list($chunk, $offset) = $matches[0];
			if ($offset > 0) $stack[] = substr($text, 0, $offset); //Text before the tag
			$stack[] = $chunk; //The actual tag
			$text = (string) substr($text, $offset + strlen($chunk)); //Drop what has been processed
		}
		return $stack;
	}

	//PCRE fragment matching the closing tag "[/<tag>]"
	private function closingPattern($tag)
	{
		return '\[/'.preg_quote($tag, '@').'\]';
	}

	/**
	 * Tests whether a token is, in its entirety, an opening tag.
	 *
	 * The match is anchored at both ends so that a larger token which merely
	 * contains the tag (an ignored [code] block, for instance) is not mistaken
	 * for the tag itself.
	 *
	 * @return array|false The preg match array (capture groups included) or false
	 */
	private function matchOpeningTag($tag, $tok)
	{
		if (preg_match('@\A(?:'.$this->tags[$tag].')\z@is', $tok, $matches)) return $matches;
		return false;
	}

	//True when the token is exactly the closing tag for $tag
	private function isClosingTag($tag, $tok)
	{
		return preg_match('@\A'.$this->closingPattern($tag).'\z@is', $tok) === 1;
	}

	//Falls back to the stored source only when no text was passed ("0" is valid text)
	private function resolveSource($source)
	{
		if ($source === '' || $source === null || $source === false) return $this->source;
		return $source;
	}

	//Nothing more than a cascade through all the handlers
	public function parseComplete($source='')
	{
		$source = $this->resolveSource($source);
		$source = $this->parseSmilies($source);
		$source = $this->parseBold($source);
		$source = $this->parseItalic($source);
		$source = $this->parseUnderline($source);
		$source = $this->parseQuotes($source);
		$source = $this->parseUrl($source);
		$source = $this->parseCode($source);
		$this->output = $source;
		return $this->output;
	}

	/**
	 * Replaces smilie codes with <img> tags. Codes are looked up by their exact
	 * spelling; anything not in the table is passed through unchanged.
	 *
	 * @param string $source Text to process; defaults to the stored source
	 * @return string
	 */
	public function parseSmilies($source='')
	{
		$source = $this->resolveSource($source);
		$this->tokens = $this->tokenize('smilies', $source); //Fetch tokens
		$ret = '';
		foreach ($this->tokens as $tok)
		{
			if (isset(self::$smilieImages[$tok]))
			{
				list($file, $alt) = self::$smilieImages[$tok];
				$ret .= '<img src="'.$this->smiliePath.'/'.$file.'" alt="'.$alt.'" />';
			}
			else
			{
				$ret .= $tok;
			}
		}
		return $ret;
	}

	/**
	 * Turns [url=...]text[/url] into an anchor that opens in a new window.
	 *
	 * @param string $source Text to process; defaults to the stored source
	 * @return string The source unchanged when the url tags are not balanced
	 */
	public function parseUrl($source='')
	{
		$source = $this->resolveSource($source);
		$this->tokens = $this->tokenize('url', $source); //Fetch tokens
		if (!$this->checkClosure('url'))
		{
			$this->output = $source;
			return $this->output;
		}
		$ret = '';
		foreach ($this->tokens as $tok)
		{
			$opening = $this->matchOpeningTag('url', $tok);
			if ($opening !== false)
			{ //Opening tag
				$href = $this->makeAbsoluteUrl($opening[1]);
				//Keep the URL from breaking out of the href attribute. "&" is left
				//alone so text that was already entity-encoded is not double-encoded.
				$href = str_replace(array('"', '<', '>'), array('&quot;', '&lt;', '&gt;'), $href);
				//NOTE(review): any "scheme://" is accepted, including javascript:// -- consider a whitelist
				$ret .= '<a href="'.$href.'" target="_blank">';
			}
			elseif ($this->isClosingTag('url', $tok))
			{
				$ret .= '</a>'; //Close the link
			}
			else
			{
				$ret .= $tok; //Insert the text
			}
		}
		$this->output = $ret;
		return $this->output;
	}

	/**
	 * Checks that the current tokens contain properly paired tags: every
	 * closing tag follows an unclosed opening tag and nothing is left open.
	 *
	 * @param string $tag Key in the tag table
	 * @return bool
	 */
	private function checkClosure($tag)
	{
		$depth = 0;
		foreach ($this->tokens as $tok)
		{
			if ($this->matchOpeningTag($tag, $tok) !== false)
			{
				$depth++;
			}
			elseif ($this->isClosingTag($tag, $tok))
			{
				$depth--;
				if ($depth < 0) return false; //Closed before it was opened
			}
		}
		return $depth === 0;
	}

	//Prefixes http:// when the URL carries no scheme of its own
	private function makeAbsoluteUrl($url)
	{
		if (!preg_match('@^[a-z]+://@i', $url)) $url = 'http://'.$url;
		return $url;
	}

	/**
	 * Renders [quote] and [quote="name"] blocks as styled boxes. Quote tags
	 * inside [code] blocks are left alone.
	 *
	 * @param string $source Text to process; defaults to the stored source
	 * @return string The source unchanged when the quote tags are not balanced
	 */
	public function parseQuotes($source='')
	{
		$source = $this->resolveSource($source);
		$this->tokens = $this->tokenize('quote', $source, array('code')); //Fetch tokens
		if (!$this->checkClosure('quote'))
		{
			$this->output = $source;
			return $this->output;
		}
		$ret = '';
		foreach ($this->tokens as $tok)
		{
			$opening = $this->matchOpeningTag('quote', $tok);
			if ($opening !== false)
			{ //Opening quote
				$info = 'Quote';
				if (!empty($opening[1])) $info = $opening[1].' wrote'; //Name parameter given
				$ret .= '<div style="font-style: italic; border: 1px dotted #777777; background: #FFFFF8; padding: 4px; margin: 4px;">'
					."\n\t\t\t\t"
					.'<div style="font-weight: bold; font-style: normal;">'.$info.':</div>';
			}
			elseif ($this->isClosingTag('quote', $tok))
			{
				$ret .= '</div>'; //Close the quote box
			}
			else
			{
				$ret .= $tok; //Insert the text
			}
		}
		$this->output = $ret;
		return $this->output;
	}

	/**
	 * Shared implementation for the parameterless inline tags (b, i, u).
	 * Tags inside [code] blocks are left alone.
	 *
	 * @param string $tag       Key in the tag table
	 * @param string $source    Text to process; '' means the stored source
	 * @param string $openHtml  HTML emitted for the opening tag
	 * @param string $closeHtml HTML emitted for the closing tag
	 * @return string The source unchanged when the tags are not balanced
	 */
	private function parseSimpleTag($tag, $source, $openHtml, $closeHtml)
	{
		$source = $this->resolveSource($source);
		$this->tokens = $this->tokenize($tag, $source, array('code')); //Fetch tokens
		if (!$this->checkClosure($tag))
		{
			$this->output = $source;
			return $this->output;
		}
		$ret = '';
		foreach ($this->tokens as $tok)
		{
			if ($this->matchOpeningTag($tag, $tok) !== false) $ret .= $openHtml;
			elseif ($this->isClosingTag($tag, $tok)) $ret .= $closeHtml;
			else $ret .= $tok; //Insert the text
		}
		$this->output = $ret;
		return $this->output;
	}

	//[b]...[/b] => <strong>...</strong>
	public function parseBold($source='')
	{
		return $this->parseSimpleTag('b', $source, '<strong>', '</strong>');
	}

	//[i]...[/i] => <em>...</em>
	public function parseItalic($source='')
	{
		return $this->parseSimpleTag('i', $source, '<em>', '</em>');
	}

	//[u]...[/u] => <u>...</u>
	public function parseUnderline($source='')
	{
		return $this->parseSimpleTag('u', $source, '<u>', '</u>');
	}

	/**
	 * Renders [code] and [code=lang] blocks as boxed <code> elements. For
	 * [code=js] / [code=javascript] the text directly after the opening tag is
	 * passed through JSHighlight (a class defined outside this file).
	 *
	 * @param string $source Text to process; defaults to the stored source
	 * @return string The source unchanged when the code tags are not balanced
	 */
	public function parseCode($source='')
	{
		$source = $this->resolveSource($source);
		$this->tokens = $this->tokenize('code', $source); //Fetch tokens
		$ret = '';
		if (!$this->checkClosure('code'))
		{
			$this->output = $source;
			return $this->output;
		}
		$type = false; //Highlighter selected by the most recent opening tag
		$last = 0;     //1-based position of the last opening tag that named a language
		$i = 0;
		foreach ($this->tokens as $tok)
		{
			$i++;
			$opening = $this->matchOpeningTag('code', $tok);
			if ($opening !== false)
			{ //Opening code block
				$type = false;
				if (!empty($opening[1]))
				{
					$language = strtolower($opening[1]);
					if ($language === 'javascript' || $language === 'js') $type = 'js';
					$last = $i;
				}
				$ret .= '<div style=" border: 1px solid #AAAAAA; padding: 4px; margin: 4px; background: #FFFFFF;">'
					."\n\t\t\t\t"
					.'<code style="white-space: pre; font-family: courier,monospace; color: #007700;">';
			}
			elseif ($this->isClosingTag('code', $tok))
			{
				$ret .= '</code></div>'; //Close the code box
			}
			else
			{
				//Only the token immediately following the opening tag is the code body
				if ($type === 'js' && $last == $i-1)
				{
					$js = new JSHighlight(str_replace('<br />', '', $tok));
					$tok = $js->Generate(1);
				}
				$ret .= $tok; //Insert the text
			}
		}
		$this->output = $ret;
		return $this->output;
	}

	//Just for debugging
	public function dumpTokens()
	{
		echo '<pre>'.print_r($this->tokens, 1).'</pre>';
	}

	//Internal
	protected function getOutput()
	{
		return $this->output;
	}

	//Result of the most recent parse call that stored its output
	public function fetchResult()
	{
		return $this->getOutput();
	}
}

?>
Hmm... I think I must have removed the plugin support at some point.

Thanks

Posted: Sun Mar 12, 2006 11:05 am
by Mehr
Thank you for your guidance and code. I'm going to use the same method to match closing tags; regex alone isn't an ultimate solution when dealing with nested tags.
But someone suggested a wonderful regex that really works with one-level-deep nested tags. I've called it "black magic". Try it yourself:
http://regexadvice.com/forums/15720/ShowPost.aspx

Posted: Sat Mar 18, 2006 1:04 pm
by Mehr
I have written this PHP function to match specific div tags in a page(No problem with nesting):

Code: Select all

/**
 * Extracts every <div ... class="Sense"> element, including all of its nested
 * divs, from the global $subject string.
 *
 * For each opening "Sense" div the following div tags are walked one at a
 * time while a nesting depth is tracked; the element ends at the closing tag
 * that brings the depth back to zero.
 *
 * @return array Extracted HTML keyed by the match index of the opening tag.
 *               An index is absent when that div is never closed; the array
 *               is empty when no "Sense" div exists.
 */
function get_senses()
{
	global $subject;

	$senses = array();
	if (!preg_match_all("/<div[^>]+class\=\"Sense\">/is", $subject, $openers, PREG_OFFSET_CAPTURE))
	{
		return $senses;
	}

	foreach ($openers[0] as $i => $opener)
	{
		$start = $opener[1];
		$depth = 0;
		$offset = $start; //The first tag found is the "Sense" div itself

		//\b keeps tags such as <divider> from being counted as divs
		while (preg_match("/<\/?div\b[^>]*>/is", $subject, $match, PREG_OFFSET_CAPTURE, $offset))
		{
			list($tag, $pos) = $match[0];
			$end = $pos + strlen($tag); //Use the real tag length; "</div >" is longer than 6

			//A closing tag starts with "</"; a slash elsewhere (e.g. in a url()) does not count
			if (strncmp($tag, '</', 2) === 0) $depth--;
			else $depth++;

			if ($depth <= 0)
			{
				$senses[$i] = substr($subject, $start, $end - $start);
				break; //Stop scanning as soon as the element is complete
			}
			$offset = $end;
		}
	}
	return $senses;
}