I wrote this a long time ago as a wiki-based parser. It's not exactly perfect, but it works fairly well.
Code: Select all
<?php
/**
 * Parse a wiki-style markup string into a flat list of tokens.
 *
 * Every token is an array with an 'element' key. The shapes produced are:
 *  - array('element' => NAME, 'type' => 'start'|'stop')  opening/closing marker
 *    (noop, code, pre, h1..h6, ul, ol, li)
 *  - array('element' => 'hr')                             horizontal rule
 *  - array('element' => 'done', 'content' => STRING)      finished content
 *  - array('element' => 'link', 'url' => URL, 'text' => LABEL)
 *
 * The function recurses into heading text, list items and the text
 * surrounding links.
 *
 * @param string $text Raw markup.
 * @return array List of token arrays, in document order.
 */
function bbTokenize($text)
{
    // used to store the final tree of the document
    $tree = array();

    // first, break out the singular entities: <noop>/<code>/<pre> areas,
    // horizontal rules (3+ dashes at line start) and heading lines (1-6 '+').
    // The delimiters themselves are kept as elements; empty pieces are dropped.
    $elements = preg_split(
        '@((?si:<noop>.*?</noop>|<code>.*?</code>|<pre>.*?</pre>)|(?m:^-{3,}|^\+{1,6}\s+.*?$))@m',
        $text,
        -1,
        PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE
    );

    // inline markers (already escaped for use inside a regex) => HTML tag
    $complex = array(
        '\/' => 'i',
        '\-' => 's',
        '\=' => 'tt',
        '\_' => 'u',
        '\^' => 'sup',
        '\*' => 'b',
        '\%' => 'sub');

    // Link patterns. All three share the same host part (dotted IPv4-looking
    // address or dotted domain name, optional http(s)/ftp(s) scheme).
    // detect: is there a link anywhere in the element?
    $linkDetect = "@(((?:https?|ftps?)://)?(?:(?:25[0-5]|2[0-4][0-9]"
        . "|[0-1]?[0-9]?[0-9])(?:\.(?:25[0-5]|2[0-4][0-9]|[0-1]?"
        . "[0-9]?[0-9])){3}|(?:[a-z0-9-]{2,}\.)+[a-z]{2,})[^ \n\r\t\?\"'\|]*).*?(?:\|([^\]]+))?@i";
    // split: cut the element around links, dropping optional [ ] brackets
    $linkSplit = "@\[?((?:(?:https?|ftps?)://)?(?:(?:25[0-5]|2[0-4][0-9]"
        . "|[0-1]?[0-9]?[0-9])(?:\.(?:25[0-5]|2[0-4][0-9]|[0-1]?"
        . "[0-9]?[0-9])){3}|(?:[a-z0-9-]{2,}\.)+[a-z]{2,})[^ \n\r\t\?\"'\|]*.*?(?:\|[^\]]+)?)\]?@i";
    // parse: group 1 is the URL, group 3 the optional "|label" text
    $linkParse = "@(((?:https?|ftps?)://)?(?:(?:25[0-5]|2[0-4][0-9]"
        . "|[0-1]?[0-9]?[0-9])(?:\.(?:25[0-5]|2[0-4][0-9]|[0-1]?"
        . "[0-9]?[0-9])){3}|(?:[a-z0-9-]{2,}\.)+[a-z]{2,})[^ \n\r\t\?\"'\|\]]*).*?(?:\|([^\]]+))?@i";

    // analyze each element for further processing
    foreach ($elements as $element) {
        if (preg_match('@^<(noop|code|pre)>@i', $element, $match)
            && preg_match('@^<('.$match[1].')>(.*?)</'.$match[1].'>$@si', $element, $match)
        ) {
            // no processing area, preformatted text or code area;
            // only <noop> content gets its newlines converted to <br />
            $tree[] = array('element' => $match[1], 'type' => 'start');
            $tree[] = array(
                'element' => 'done',
                'content' => (strcasecmp($match[1], 'noop') == 0 ? nl2br($match[2]) : $match[2])
            );
            $tree[] = array('element' => $match[1], 'type' => 'stop');
        } elseif (preg_match('@^-{3,}@', $element)) {
            // horizontal rule element
            $tree[] = array('element' => 'hr');
        } elseif (preg_match('@^(\+{1,6})\s+(.*?)\s*$@s', $element, $match)) {
            // heading element: the number of '+' signs is the heading level
            $heading = 'h'.strlen($match[1]);
            $tree[] = array('element' => $heading, 'type' => 'start');
            $tree = array_merge($tree, bbTokenize($match[2]));
            $tree[] = array('element' => $heading, 'type' => 'stop');
        } elseif (preg_match('@(?m:^)(?s:([#\*])\s.*?(?:\n\n|$))@', $element, $match)) {
            // list nesting: '*' starts an unordered list, '#' an ordered one
            $type = ($match[1] == '*' ? 'ul' : 'ol');
            $tree[] = array('element' => $type, 'type' => 'start');

            // split into alternating (marker, item text) pairs.
            // NOTE(review): the piece before the first marker is discarded.
            $more = preg_split('@\s*^(\s*[#\*])\s+@m', $element, -1, PREG_SPLIT_DELIM_CAPTURE);
            array_shift($more);

            // NOTE(review): $last and $lastType are never updated inside the
            // loop, so every item is compared against depth 0 and the type of
            // the first marker. Kept as-is to preserve existing output.
            $last = 0;
            $lastType = $type;
            $end = array(); // pending closing tokens for nested lists

            for ($k = 0, $n = count($more); $k < $n; $k += 2) {
                $marker = $more[$k];
                // leading whitespace before the marker gives the nesting depth
                $depth = strlen($marker) - 1;
                // square-bracket offset: the {} form was removed in PHP 8.0
                $type = ($marker[$depth] == '*' ? 'ul' : 'ol');
                $item = bbTokenize($more[$k + 1]);

                if ($depth === $last && $type === $lastType) {
                    // plain sibling item
                    $tree[] = array('element' => 'li', 'type' => 'start');
                    $tree = array_merge($tree, $item);
                    $tree[] = array('element' => 'li', 'type' => 'stop');
                    continue;
                }

                if ($depth <= $last) {
                    // shallower item, or same depth with another list type:
                    // close whatever is still open before starting anew
                    $tree = array_merge($tree, $end);
                    $end = array();
                }

                // open a nested list and its first item; their closing
                // tokens are queued (innermost first) until the next flush
                $tree[] = array('element' => $type, 'type' => 'start');
                array_unshift($end, array('element' => $type, 'type' => 'stop'));
                $tree[] = array('element' => 'li', 'type' => 'start');
                array_unshift($end, array('element' => 'li', 'type' => 'stop'));
                $tree = array_merge($tree, $item);
            }

            $tree = array_merge($tree, $end);
            // closes the outer list using the type of the last item seen
            $tree[] = array('element' => $type, 'type' => 'stop');
        } elseif (preg_match($linkDetect, $element)) {
            // web links are available
            $more = preg_split($linkSplit, $element, -1, PREG_SPLIT_DELIM_CAPTURE);
            foreach ($more as $piece) {
                if (preg_match($linkParse, $piece, $match)) {
                    // without a "|label" part the URL doubles as link text
                    $tree[] = array(
                        'element' => 'link',
                        'url' => $match[1],
                        'text' => (empty($match[3]) ? $match[1] : $match[3])
                    );
                } else {
                    $tree = array_merge($tree, bbTokenize($piece));
                }
            }
        } elseif (preg_match('@^(\s*)$@s', $element)) {
            // element is entirely white space.. ignore it.
        } else {
            // final content format processing.
            // A leading space is ensured before the inline markers are
            // processed and is stripped again when the content is stored.
            $element = htmlentities((substr($element, 0, 1) == ' ' ? $element : ' '.$element), ENT_QUOTES);
            foreach ($complex as $c => $tag) {
                $element = preg_replace(
                    '@'.$c.'((?:'.$c.$c.'){0,}(?=[\S]).*?(?<=[\S]))'.$c.'(?:'.$c.$c.'){0,}(?![\w\d])@si',
                    '<'.$tag.'>\\1</'.$tag.'>',
                    $element
                );
            }
            // NOTE(review): $c still carries its regex backslash here, so this
            // searches for the literal escaped pair (e.g. "\/\/"), not "//".
            foreach ($complex as $c => $tag) {
                $element = str_replace($c.$c, $c, $element);
            }
            $element = nl2br($element);
            $tree[] = array('element' => 'done', 'content' => substr($element, 1));
        }
    }

    return $tree;
}
/**
 * Turn a raw message into HTML.
 *
 * With $options['cocode'] set, the text is tokenized by bbTokenize() and the
 * tokens are rendered as HTML tags, with links passed through bbClickable().
 * Otherwise the text is HTML-escaped, newlines become <br />, and smilies are
 * applied through bbDoSmilies() when $options['smilies'] is set.
 *
 * @param string $text    Raw message text.
 * @param array  $options Flags; the keys read here are 'cocode' and 'smilies'.
 * @return array array('processed' => HTML string)
 */
function bbMessageProcess($text, $options)
{
    if (isset($options['cocode']) && $options['cocode']) {
        // first we tokenize if we are doing CoCode
        $tree = bbTokenize($text);

        // what elements to map where.. (specialty ones only).
        // An empty mapping means "emit no tag for this element".
        $elements = array(
            'noop' => '',
            'done' => '',
        );

        $text = '';
        // now we create the document using your options
        foreach ($tree as $branch) {
            if (!isset($branch['element'])) {
                continue;
            }

            $element = isset($elements[$branch['element']])
                ? $elements[$branch['element']]
                : $branch['element'];

            if ($element === 'link') {
                $url = $branch['url'];
                // parse_url() may return false or omit 'host' (e.g. for a
                // scheme-less "example.com/page"); isset() covers both cases.
                $parse = parse_url($url);
                $hasHost = isset($parse['host']);

                if ($branch['text'] === $url) {
                    // bare URL: show the host in brackets, full URL as tooltip
                    $tooltip = $branch['text'];
                    $txt = $hasHost ? $parse['host'] : 'localhost';
                    $pre = '[';
                    $post = ']';
                } else {
                    // labelled link: show the label, then the host in brackets
                    $tooltip = '';
                    $txt = $branch['text'];
                    $pre = '';
                    $post = ' ['.($hasHost ? $parse['host'] : '').']';
                }

                $text .= $pre.bbClickable(array(
                    'text' => $txt,
                    'tooltip' => $tooltip,
                    'data' => array('url' => $url),
                )).$post;
            } elseif (isset($branch['type'])) {
                if ($branch['type'] === 'start') {
                    $text .= (empty($element) ? '' : '<'.$element.'>');
                } elseif ($branch['type'] === 'stop') {
                    $text .= (empty($element) ? '' : '</'.$element.'>');
                }
            } elseif (!empty($element)) {
                // element without start/stop type: self-closing tag.
                // ('done' is mapped to '' above, so it emits no tag here.)
                $text .= '<'.$element.' />';
            }

            if (isset($branch['content'])) {
                $text .= $branch['content'];
            }
        }
    } else {
        // basic conversion so no one can post malicious code.
        $text = nl2br(htmlentities($text, ENT_QUOTES));
        if (isset($options['smilies']) && $options['smilies']) {
            $text = bbDoSmilies($text);
        }
    }

    // this must return an array for use in the appcreate platform :/
    return array('processed' => $text);
}
?>