I've gone down a route like this (quite complex, but very flexible). I'm still tweaking things and trying to get bits and pieces to work but I'll post a finished version in critique if it's not miserably slow. It can find pathways through tokens, knowing what to expect next, which was my main objective. PCRE is only used where needed, whitespace is pretty much ignored, groups of tokens can be generalized, segments of tokens can be encapsed (i.e. between /* and */ or " and " etc) and the string is now read byte-for-byte.
Just getting the basics working first, then hopefully I can just throw in all the definitions and it will just analyze it (hopefully).
This doesn't work properly if you try to run it, just showing where I'm aiming (grrr... I should comment more while I'm coding).
Pretty complex definition files like this:
Code: Select all
<?php
class PhpDefinition extends Definition
{
private $toknames = array(
'ESCAPE_CHAR', 'DOUBLE_QUOTE',
'SINGLE_QUOTE', 'BACKTICK',
'DOUBLE_SLASH', 'HASH_SIGN',
'ML_COMMENT_OPEN', 'ML_COMMENT_CLOSE',
'LF', 'CRLF',
'CLASS', 'FUNCTION',
'EXTENDS', 'IMPLEMENTS',
'STRING', 'ENCAPSED_STRING',
'ML_COMMENT', 'COMMENT',
'LEFT_CURLY', 'RIGHT_CURLY',
'EOL', 'PHP_TAG_OPEN',
'PHP_TAG_CLOSE', 'VARIABLE'
);
public function __construct()
{
foreach ($this->toknames as $int => $tok)
{
if (!defined('_t'.$tok)) define('_t'.$tok, $int);
}
}
public function getTokenTypes()
{
return $this->toknames;
}
public function getTokenDefs()
{
return array(
_tESCAPE_CHAR => '\\', _tDOUBLE_QUOTE => '"',
_tSINGLE_QUOTE => "'", _tBACKTICK => '`',
_tDOUBLE_SLASH => '//', _tHASH_SIGN => '#',
_tML_COMMENT_OPEN => '/*', _tML_COMMENT_CLOSE => '*/',
_tLF => "\n", _tCRLF => "\r\n",
_tCLASS => 'class', _tFUNCTION => 'function',
_tEXTENDS => 'extends', _tIMPLEMENTS => 'implements',
_tSTRING => new DefPattern('@^\w+$@D'), _tENCAPSED_STRING => new DefGroup(
new DefBlock(_tDOUBLE_QUOTE, _tDOUBLE_QUOTE),
new DefBlock(_tSINGLE_QUOTE, _tSINGLE_QUOTE)),
_tCOMMENT => new DefGroup(
new DefBlock(_tDOUBLE_SLASH, _tEOL),
new DefBlock(_tHASH_SIGN, _tEOL),
_tML_COMMENT), _tML_COMMENT => new DefBlock(_tML_COMMENT_OPEN, _tML_COMMENT_CLOSE),
_tLEFT_CURLY => '{', _tRIGHT_CURLY => '}',
_tEOL => new DefGroup(_tLF, _tCRLF), _tPHP_TAG_OPEN => '<?php',
_tPHP_TAG_CLOSE => '?>', _tVARIABLE => new DefPattern('/^\$[a-z_]+[a-z0-9_]*$/iD')
);
}
public function getEscapeTokens()
{
return _tESCAPE_CHAR;
}
public function getStringTokens()
{
return _tSTRING;
}
public function getEncapsedStringTokens()
{
return _tENCAPSED_STRING;
}
public function getCommentTokens()
{
return _tCOMMENT;
}
public function getTokenSequences()
{
return array(
NULL => _tPHP_TAG_OPEN,
_tPHP_TAG_OPEN => new DefGroup(_tCLASS, _tVARIABLE, _tFUNCTION),
_tCLASS => new DefSequence(array(_tSTRING => new DefGroup(_tEXTENDS, _tLEFT_CURLY, _tIMPLEMENTS))),
_tEXTENDS => new DefSequence(array(_tSTRING => new DefGroup(_tLEFT_CURLY, _tIMPLEMENTS))),
_tIMPLEMENTS => new DefSequence(array(_tSTRING => new DefGroup(_tEXTENDS, _tLEFT_CURLY)))
);
}
}
?>
The lexing stuff:
Code: Select all
<?php
/**
* Defines the methods which all definitions need to contain
*/
abstract class Definition
{
abstract public function getTokenTypes();
abstract public function getTokenDefs();
abstract public function getTokenSequences();
abstract public function getEscapeTokens();
abstract public function getStringTokens();
abstract public function getEncapsedStringTokens();
abstract public function getCommentTokens();
}
/**
* Holds a PCRE type pattern to define a token
*/
class DefPattern
{
/**
* The PCRE pattern
* @var string pattern
*/
private $pattern;
/**
* Constructor
* @param string pattern
*/
public function __construct($pattern)
{
$this->pattern = (string) $pattern;
}
/**
* Get this pattern
* @return string pattern
*/
public function get()
{
return $this->pattern;
}
}
/**
* Defines a token which runs from a token of type $start to a token of type $end
* All characters between the $start and $end are considered encapsed and will not
* be defined separately.
*/
class DefBlock
{
/**
* The token type at which this block starts
* @var int token
*/
private $start;
/**
* The token type at which this block ends
* @var int token
*/
private $end;
/**
* Constructor
* @param int token start
* @param int token end
*/
public function __construct($start, $end)
{
$this->start = $start;
$this->end = $end;
}
/**
* Get the starting token
* @return int token
*/
public function getStart()
{
return $this->start;
}
/**
* Get the ending token
* @return int token
*/
public function getEnd()
{
return $this->end;
}
}
/**
* Generalizes a group of tokens as a single entity
* E.g. 'string' and "string" of types DefBlock(SGL_QT, SGL_QT) and DefBlock(DBL_QT, DBL_QT)
* into a group DefGroup ENCAPSED_STRING.
*/
class DefGroup
{
/**
* The token collection
* @var array <int> tokens
*/
public $types = array();
/**
* Constructor
* @param int token
* @param int token, .....
*/
public function __construct()
{
$this->types = func_get_args();
}
/**
* Check if this group contains a given token
* @param int token
* @return bool
*/
public function has($token)
{
if (in_array($this->types, $token)) return true;
else return false;
}
public function get()
{
return $this->types;
}
}
/**
* Group sequences of tokens in a set order as one
* E.g. 'class foo extends' and 'class foo {' and 'class foo implements'
*/
class DefSequence
{
private $sequence;
public function __construct()
{
$this->sequence = func_get_args();
}
public function get()
{
return $this->sequence;
}
}
if (!defined('_lexWHITESPACE')) define('_lexWHITESPACE', '/\s+/');
if (!defined('_lexWHITESPACE_NUM')) define('_lexWHITESPACE_NUM', -1);
/**
* The main lexical analyzer
* Reads a set of definitions contained in a subclass of Definition
* Scans a given excerpt of source code and produces a list of tokens
*/
class LexScan
{
private $source;
private $sourceLength = 0;
private $escapechar;
private $string;
private $def;
private $locatedTokens = array();
private $sequences;
private $expectNext;
private $currentLine = 1;
private $setNext = true;
public function __construct(Definition $definition)
{
$this->def = $definition;
$this->escapechar = $definition->getEscapeTokens();
$this->string = $definition->getStringTokens();
$this->encapsedString = $definition->getEncapsedStringTokens();
$this->sequences = $definition->getTokenSequences();
$this->types = $definition->getTokenDefs();
}
public function setSource($source)
{
$this->source = (string) $source;
$this->sourceLength = strlen($this->source);
}
public function scan()
{
$instring = false;
$incomment = false;
$commenttype = false;
$ignorechar = false;
$token = "";
$this->expectNext = $this->sequences[NULL];
for ($pos = 0; $pos < $this->sourceLength; $pos++)
{
$char = substr($this->source, $pos, 1);
if ($char == "\n") $this->currentLine++;
if (!$ignorechar && $this->tokenMatch($char, $this->escapechar)) $ignorechar = true;
if (($match = $this->tokenMatch($token.$char, $this->expectNext)) && !$instring && !$incomment)
{
if (($match['type'] == $this->string)
&& (@$this->locatedTokens[sizeof($this->locatedTokens)-1][0] == $this->string))
{
$this->locatedTokens[sizeof($this->locatedTokens)-1][1] .= $match['token'];
}
else $this->locatedTokens[] = array($match['type'], $match['token']);
if ($this->setNext) $this->expectNext = $this->sequences[$match['type']];
$token = "";
}
else if (preg_match(_lexWHITESPACE, $char) && !$instring && !$incomment)
{
if (@$this->locatedTokens[sizeof($this->locatedTokens)-1][0] != _lexWHITESPACE_NUM)
$this->locatedTokens[] = array(_lexWHITESPACE_NUM, '');
}
else $token .= $char;
$this->setNext = true;
}
}
private function tokenMatch($string, $token)
{
if (is_int($token)) $def = $this->types[$token];
else $def = $token;
if (is_string($def))
{
if ($string == $def) return array('type' => $token, 'token' => $string);
else return false;
}
else if ($def instanceof DefPattern)
{
if (preg_match($def->get(), $string)) return array('type' => $token, 'token' => $string);
else return false;
}
else if ($def instanceof DefGroup)
{
foreach ($def->get() as $tokdef)
{
if ($match = $this->tokenMatch($string, $tokdef))
{
return $match;
}
}
return false;
}
else if ($def instanceof DefSequence)
{
foreach ($def->get() as $sequence)
{
foreach ($sequence as $tokdef => $next_tokens)
{
if ($match = $this->tokenMatch($string, $tokdef))
{
if ($match['type'] == $this->string) $this->expectNext = $def;
else $this->expectNext = $next_tokens;
$this->setNext = false;
return $match;
}
}
}
return false;
}
else return false;
}
}
?>
There's smelly bits of code in there which need refactoring but I'm glad the whole idea hasn't fallen flat on it's face so far.
EDIT | I looked at SimpleTest's Mock library... no lexing here... just building from values of get_class_methods().