What's the percentage of execution time eaten by the HTMLPurifier classes involved here in terms of the execution time for this function?
Note that I'm running this on a not-very-powerful computer.
Class->Function: Avg Self, Avg Cum, Total Self, Total Cum, Calls
HTMLPurifier_Lexer_DOMLex->tokenizeDOM: 5.3ms; 30ms; 18,706ms; 105,201ms; 3560
HTMLPurifier_TokenFactory->createEnd: 0.1ms; 0.3ms; 140ms; 407ms; 1,290
HTMLPurifier_TokenFactory->createStart: -; 0.3ms; 126ms; 409ms; 1,290
HTMLPurifier_TokenFactory->createText: 0.1ms; 0.3ms; 269ms; 709ms; 2,258
HTMLPurifier_Token_Tag->HTMLPurifier_Token_Tag: 0.2ms; 0.2ms; 530ms; 551ms; 2,597
HTMLPurifier_Token_Text->HTMLPurifier_Token_Text: 0.2ms; 0.2ms; 438ms; 445ms; 2303
Might be a good idea to see the new code, if you plan to keep it.
Code: Select all
protected function tokenizeDOM($node, $tokens = array(), $collect = false) {
// recursive goodness!
// intercept non element nodes
if ( !($node instanceof DOMElement) ) {
if ($node instanceof DOMComment) {
$tokens[] = $this->factory->createComment($node->data);
} elseif ($node instanceof DOMText ||
$node instanceof DOMCharacterData) {
$tokens[] = $this->factory->createText($node->data);
}
// quite possibly, the object wasn't handled, that's fine
return $tokens;
}
// We still have to make sure that the element actually IS empty
if (!$node->hasChildNodes()) {
if ($collect) {
$tokens[] = $this->factory->createEmpty(
$node->tagName,
$this->transformAttrToAssoc($node->attributes)
);
}
} else {
if ($collect) { // don't wrap on first iteration
$tokens[] = $this->factory->createStart(
$tag_name = $node->tagName, // somehow, it get's dropped
$this->transformAttrToAssoc($node->attributes)
);
}
foreach ($node->childNodes as $node) {
// remember, it's an accumulator. Otherwise, we'd have
// to use array_merge
$tokens = $this->tokenizeDOM($node, $tokens, true);
}
if ($collect) {
$tokens[] = $this->factory->createEnd($tag_name);
}
}
return $tokens;
}
Contents of $this->factory
Code: Select all
<?php
require_once 'HTMLPurifier/Token.php';
class HTMLPurifier_TokenFactory
{
// p stands for prototype
private $p_start, $p_end, $p_empty, $p_text, $p_comment;
public function __construct() {
$this->p_start = new HTMLPurifier_Token_Start('', array());
$this->p_end = new HTMLPurifier_Token_End('');
$this->p_empty = new HTMLPurifier_Token_Empty('', array());
$this->p_text = new HTMLPurifier_Token_Text('');
$this->p_comment= new HTMLPurifier_Token_Comment('');
}
public function createStart($name, $attributes = array()) {
$p = clone $this->p_start;
$p->HTMLPurifier_Token_Tag($name, $attributes);
return $p;
}
public function createEnd($name) {
$p = clone $this->p_end;
$p->HTMLPurifier_Token_Tag($name);
return $p;
}
public function createEmpty($name, $attributes = array()) {
$p = clone $this->p_empty;
$p->HTMLPurifier_Token_Tag($name, $attributes);
return $p;
}
public function createText($data) {
$p = clone $this->p_text;
$p->HTMLPurifier_Token_Text($data);
return $p;
}
public function createComment($data) {
$p = clone $this->p_comment;
$p->HTMLPurifier_Token_Comment($data);
return $p;
}
}
?>
Also, the tokens have constructors which do take up some execution time:
Code: Select all
<?php
/**
* Defines a set of immutable value object tokens for HTML representation.
*
* @file
*/
/**
* Abstract base token class that all others inherit from.
*/
class HTMLPurifier_Token {
var $type; /**< Type of node to bypass <tt>is_a()</tt>. @public */
}
/**
* Abstract class of a tag token (start, end or empty), and its behavior.
*/
class HTMLPurifier_Token_Tag extends HTMLPurifier_Token // abstract
{
/**
* Static bool marker that indicates the class is a tag.
*
* This allows us to check objects with <tt>!empty($obj->is_tag)</tt>
* without having to use a function call <tt>is_a()</tt>.
*
* @public
*/
var $is_tag = true;
/**
* The lower-case name of the tag, like 'a', 'b' or 'blockquote'.
*
* @note Strictly speaking, XML tags are case sensitive, so we shouldn't
* be lower-casing them, but these tokens cater to HTML tags, which are
* insensitive.
*
* @public
*/
var $name;
/**
* Associative array of the tag's attributes.
*/
var $attributes = array();
/**
* Non-overloaded constructor, which lower-cases passed tag name.
*
* @param $name String name.
* @param $attributes Associative array of attributes.
*/
function HTMLPurifier_Token_Tag($name, $attributes = array()) {
//if ($attributes === null) var_dump(debug_backtrace());
$this->name = ctype_lower($name) ? $name : strtolower($name);
foreach ($attributes as $key => $value) {
// normalization only necessary when key is not lowercase
if (!ctype_lower($key)) {
$new_key = strtolower($key);
if (!isset($attributes[$new_key])) {
$attributes[$new_key] = $attributes[$key];
}
if ($new_key !== $key) {
unset($attributes[$key]);
}
}
}
$this->attributes = $attributes;
}
}
/**
* Concrete start token class.
*/
class HTMLPurifier_Token_Start extends HTMLPurifier_Token_Tag
{
var $type = 'start';
function copy() {
return new HTMLPurifier_Token_Start($this->name, $this->attributes);
}
}
/**
* Concrete empty token class.
*/
class HTMLPurifier_Token_Empty extends HTMLPurifier_Token_Tag
{
var $type = 'empty';
function copy() {
return new HTMLPurifier_Token_Empty($this->name, $this->attributes);
}
}
/**
* Concrete end token class.
*
* @warning This class accepts attributes even though end tags cannot. This
* is for optimization reasons, as under normal circumstances, the Lexers
* do not pass attributes.
*/
class HTMLPurifier_Token_End extends HTMLPurifier_Token_Tag
{
var $type = 'end';
function copy() {
return new HTMLPurifier_Token_End($this->name);
}
}
/**
* Concrete text token class.
*
* Text tokens comprise of regular parsed character data (PCDATA) and raw
* character data (from the CDATA sections). Internally, their
* data is parsed with all entities expanded. Surprisingly, the text token
* does have a "tag name" called #PCDATA, which is how the DTD represents it
* in permissible child nodes.
*/
class HTMLPurifier_Token_Text extends HTMLPurifier_Token
{
var $name = '#PCDATA'; /**< PCDATA tag name compatible with DTD. @public */
var $type = 'text';
var $data; /**< Parsed character data of text. @public */
var $is_whitespace; /**< Bool indicating if node is whitespace. @public */
/**
* Constructor, accepts data and determines if it is whitespace.
*
* @param $data String parsed character data.
*/
function HTMLPurifier_Token_Text($data) {
$this->data = $data;
$this->is_whitespace = ctype_space($data);
}
function copy() {
return new HTMLPurifier_Token_Text($this->data);
}
}
/**
* Concrete comment token class. Generally will be ignored.
*/
class HTMLPurifier_Token_Comment extends HTMLPurifier_Token
{
var $data; /**< Character data within comment. @public */
var $type = 'comment';
/**
* Transparent constructor.
*
* @param $data String comment data.
*/
function HTMLPurifier_Token_Comment($data) {
$this->data = $data;
}
function copy() {
return new HTMLPurifier_Token_Comment($this->data);
}
}
?>