Well, if it's any help, here's the start of a generic parser I'm working on. It really needs some speed increases, but it's a base. It would be a lot faster if it weren't so generic. You use configuration files (that array) to define your token definitions. I'll be extending it to make a generic lexical analyzer too.
Code: Select all
// Token definition table consumed by the lexer.
//
// Each entry maps a token name to array(pattern, isRegex):
//   - isRegex = 1: pattern is a PCRE regular expression (delimited with @...@)
//   - isRegex = 0: pattern is a static string that is matched literally
//
// Types should be listed in order of precedence.
// For example, look for strings before variables since a variable inside a string is not valid.
// The same rule applies to operators that share a prefix: the longer operator must
// come first ('!==' before '!=', '===' before '==', '.=' before '.'), and numeric
// literals must come before the bare '.' so the dot in "3.14" is not taken as concatenation.
$tokenTypes = array(
    // Escapes, strings and comments: highest precedence so their contents are not re-tokenized
    'TK_ESCAPE_CHARACTER' => array('\\\\', 0),
    'TK_DOUBLE_STRING' => array('@(?<!\\\\)".*?(?<!\\\\)"@s', 1),
    'TK_LITERAL_STRING' => array("@(?<!\\\\)'.*?(?<!\\\\)'@s", 1),
    'TK_COMMENT' => array('@(?<!\\\\)/\\*(.*?)\\*/|//.*?$|#.*?$@sm', 1),
    'TK_VARIABLE' => array('@\\$[a-z_]\w*@i', 1),

    // Declaration and output keywords
    'TK_CLASS' => array('@\bclass\b@i', 1),
    'TK_FUNCTION' => array('@\b(?:c)?function\b@i', 1),
    'TK_INTERFACE' => array('@\binterface\b@i', 1),
    'TK_ECHO' => array('@\becho\b@i', 1),
    'TK_PRINT' => array('@\bprint\b@i', 1),
    'TK_EXIT' => array('@\bexit\b@i', 1),
    'TK_DIE' => array('@\bdie\b@i', 1),

    // PHP open/close tags ('<?=' must precede '<?')
    'TK_OPEN_TAG_WITH_ECHO' => array('@<\?=@i', 1),
    'TK_OPEN_TAG' => array('@<\?(?:php)?@i', 1),
    'TK_CLOSE_TAG' => array('?>', 0),

    // Casts: must precede the bare parentheses
    'TK_ARRAY_CAST' => array('@\([ \t]*array[ \t]*\)@i', 1),
    'TK_DOUBLE_CAST' => array('@\(\s*(?:double|float|real)\s*\)@i', 1),

    // Multi-character operators: longest first within each shared prefix
    'TK_AND_EQUAL' => array('&=', 0),
    'TK_OBJECT_OPERATOR' => array('->', 0),
    'TK_DOUBLE_ARROW' => array('=>', 0),
    'TK_APPEND_OPERATOR' => array('.=', 0),
    // '!==' has to be tried before '!=' or it would be split into '!=' and '='
    'TK_NOT_IDENTICAL' => array('!==', 0),
    'TK_NOT_EQUAL' => array('!=', 0),
    'TK_BOOLEAN_AND' => array('&&', 0),
    'TK_BOOLEAN_OR' => array('||', 0),
    'TK_INC' => array('++', 0),
    'TK_DEC' => array('--', 0),
    'TK_IS_IDENTICAL' => array('===', 0),
    'TK_IS_EQUAL' => array('==', 0),
    'TK_LESS_THAN_OR_EQUAL' => array('<=', 0),
    'TK_GREATER_THAN_OR_EQUAL' => array('>=', 0),
    'TK_BITWISE_LEFT_SHIFT' => array('<<', 0),
    'TK_BITWISE_RIGHT_SHIFT' => array('>>', 0),
    'TK_EQUALS' => array('=', 0),
    'TK_RIGHT_PAREN' => array(')', 0),
    'TK_LEFT_PAREN' => array('(', 0),
    'TK_COMMA' => array(',', 0),

    // Numeric literals: listed before '.' so a float keeps its decimal point,
    // and most specific form first (hex, float, octal, then plain integer)
    'TK_HEX_NUMERAL' => array('@0x[a-f0-9]+@i', 1),
    'TK_DECIMAL_OR_FLOAT' => array('@\d+\.\d+@', 1),
    'TK_OCT_NUMERAL' => array('@0\d+@', 1),
    'TK_INTEGER_NUMERAL' => array('@\d+@', 1),

    // Single-character operators and punctuation
    'TK_CONCAT_OPERATOR' => array('.', 0),
    'TK_GREATER_THAN' => array('>', 0),
    'TK_LESS_THAN' => array('<', 0),
    'TK_REFERENCE_OPERATOR' => array('&', 0),
    'TK_LEFT_BRACKET' => array('[', 0),
    'TK_RIGHT_BRACKET' => array(']', 0),
    'TK_COLON' => array(':', 0),
    'TK_SEMICOLON' => array(';', 0),
    'TK_NEGATION_OPERATOR' => array('!', 0),
    'TK_RIGHT_BRACE' => array('}', 0),
    'TK_LEFT_BRACE' => array('{', 0),
    'TK_PLUS' => array('+', 0),
    'TK_MINUS' => array('-', 0),

    // Remaining keywords (all matched case-insensitively, as PHP keywords are)
    'TK_IF' => array('@\bif\b@i', 1),
    'TK_ELSE' => array('@\belse\b@i', 1),
    'TK_ELSEIF' => array('@\belseif\b@i', 1),
    'TK_ARRAY' => array('@\barray\b@i', 1),
    'TK_AS' => array('@\bas\b@i', 1),
    'TK_PUBLIC' => array('@\bpublic\b@i', 1),
    'TK_PRIVATE' => array('@\bprivate\b@i', 1),
    'TK_PROTECTED' => array('@\bprotected\b@i', 1),
    'TK_VAR' => array('@\bvar\b@i', 1),
    'TK_STATIC' => array('@\bstatic\b@i', 1),
    'TK_EXTENDS' => array('@\bextends\b@i', 1),
    'TK_IMPLEMENTS' => array('@\bimplements\b@i', 1),
    'TK_CASE' => array('@\bcase\b@i', 1),

    // Catch-alls: lowest precedence
    'TK_WHITESPACE' => array('@\s+@', 1),
    'TK_UNQUOTED_STRING' => array('@\w+@', 1), //Class names, function names, constants (The lexer will deal with this)
    'TK_UNKNOWN' => array('@\W@', 1)
);
// Load the script to tokenize. file_get_contents() returns false when the file
// cannot be read; stop here rather than handing a boolean to the lexer.
$source = file_get_contents('index.php');
if ($source === false)
{
    exit('Unable to read index.php');
}

// Configure the lexer with the token table and run it.
// NOTE(review): the TK_* constants used below are not defined in this snippet;
// presumably the lexer defines them from the keys passed to addDefinitions() — confirm.
$lex = new lexer($source);
$lex->addDefinitions($tokenTypes);
$lex->setInertTokens(array(TK_WHITESPACE));
$lex->tokenize();
$tok = $lex->getTokens();

// Drop comments and whitespace so only significant tokens are displayed.
// Keys are left as-is (not reindexed) after unset().
foreach ($tok as $k => $arr)
{
    if ($arr['type'] == TK_COMMENT || $arr['type'] == TK_WHITESPACE)
    {
        unset($tok[$k]);
    }
}
// Render the remaining tokens as an HTML table: one row per token showing its
// type name, its (HTML-escaped) text and its offset.
echo '<table cellpadding=3 border=1>
<tr>
<td><b>Token Type</b></td>
<td><b>Token</b></td>
<td><b>Offset</b></td>
</tr>
';
foreach ($tok as $entry)
{
    // Resolve every cell value first, then emit the row in one go.
    $typeName = $lex->getTokenName($entry['type']);
    // Escape the token text for HTML and keep its line breaks visible.
    $tokenHtml = nl2br(htmlentities($entry['token']));
    $offset = $entry['offset'];

    echo '<tr>
<td>', $typeName, '</td>
<td><div style="overflow: hidden; width: 500px;">', $tokenHtml, '</div></td>
<td>', $offset, '</td>
</tr>
';
}
echo '</table>';
In the array a "1" means that it's a regex string, and a zero means it's a static string.
EDIT | Updated code as per my updated version. Now approx. 15% faster @ 6900 bytes, 70 token definitions. (Main change == using constants/numbers rather than strings)