I have written an XML parser for a fairly small file (115 KB). I initially wrote it using an OO approach, and was concerned by how long the page took to load (~4-5 seconds). I thought the post-processing might be slowing it down (I read the file into an array, then loop through it adding output, formatting, etc.), so I cut all of that out, and it still took quite a long time. I then whipped up a second version that dropped the OO style in favour of a functional (i.e. procedural, plain functions plus globals) approach, and that cut the time down to ~1 second! A huge improvement that I was not expecting. I cleared the cache before testing the loading time to make sure I was getting a reliable measurement.
Does this make sense to others? I would like to stick with the OO approach, as it is easier to structure and to expand should I need to, but not if it is going to cost me this extra time. Is accessing object properties really that much slower than accessing a plain variable, or am I doing something in the OO code that causes extra instances of the class to be created?
The OO approach
Code: Select all
/**
 * Event-driven (SAX-style) parser that turns a publications XML file into a
 * list of publication objects.
 *
 * The element handlers keep a stack of "open" objects in $publication: a
 * <publication> pushes a new publication object, a nested <book> pushes the
 * Book attached to it, and the matching end tags pop them again. Text between
 * tags is accumulated in $data and handed to the current object when the
 * element closes.
 *
 * The classes Article, BookContrib, Patent, PatentPub and Book are expected to
 * be defined elsewhere; only the setters/adders called below are relied upon.
 */
class PubsParser
{
    const NO_PUB_TYPE = 'No publication type declared';

    /** When true, parseFile() dumps the raw parse structure instead of building objects. */
    private $xmlDebug;
    /** When true, debugPrint() echoes its argument. */
    private $xmlDebugOutput;
    /** The XML parser handle created by parseFile(). */
    private $p;
    /** Set to true once the first <publication> start tag has been seen. */
    private $inEntry;
    /** Stack of currently open publication/book objects (innermost last). */
    private $publication;
    /** Character data collected for the element currently being read. */
    private $data;
    /** Name parts (first/middle/last/suffix) of the author being read. */
    private $curAuthor;
    /** Set to true when a <book> start tag is seen. */
    private $inBook;
    /** Initialised to 0; not otherwise used in this class. */
    private $pubLevel;
    /** Object that receives the values of the elements being closed. */
    private $curPub;
    /** Number of <publication> start tags seen so far. */
    private $curPubCount;
    /** Completed publication objects, in document order. */
    private $publications;

    /**
     * @param bool $xmlDebug       dump the parse structure instead of building objects
     * @param bool $xmlDebugOutput echo trace messages from the handlers
     */
    public function __construct($xmlDebug=false, $xmlDebugOutput=false)
    {
        $this->xmlDebug = $xmlDebug;
        $this->xmlDebugOutput = $xmlDebugOutput;

        $this->inEntry = false;
        $this->inBook = false;
        $this->data = "";
        $this->pubLevel = 0;
        $this->curPubCount = 0;
        $this->curPub = null;
        $this->curAuthor = array();
        // Start with real (empty) arrays so the stack operations never see null.
        $this->publication = array();
        $this->publications = array();
    }

    /**
     * Echoes $s when debug output was enabled in the constructor.
     *
     * @param string $s message to print
     */
    public function debugPrint($s)
    {
        if($this->xmlDebugOutput)
        {
            echo $s;
        }
    }

    /**
     * Parses $file and fills the list returned by getPubs().
     *
     * In debug mode ($xmlDebug) the file is parsed into a structure that is
     * printed with print_r() and no publication objects are built.
     * Terminates the script with die() if the file cannot be opened or read,
     * or if the XML parser reports an error.
     *
     * @param string $file path of the XML file
     */
    public function parseFile($file="pubs.xml")
    {
        $this->p = xml_parser_create();
        xml_parser_set_option($this->p, XML_OPTION_SKIP_WHITE, 1);
        xml_parser_set_option($this->p, XML_OPTION_CASE_FOLDING, 0);

        if($this->xmlDebug)
        {
            xml_parse_into_struct($this->p, implode("", file($file)), $val, $inx);
            print_r($val);
            print_r($inx);
            return;
        }

        // Register the handlers as object/method callables; this does not
        // need a separate xml_set_object() call.
        xml_set_element_handler(
            $this->p,
            array($this, 'startElement'),
            array($this, 'endElement')
        );
        xml_set_character_data_handler($this->p, array($this, 'contents'));

        $fp = fopen($file, "r") or die("Could not open file");

        // Feed the parser in fixed-size chunks. Looping on feof() rather than
        // on the truthiness of the chunk avoids stopping early and lets the
        // last call tell the parser that the document is complete.
        while(!feof($fp))
        {
            $chunk = fread($fp, 8192);
            if($chunk === false)
            {
                fclose($fp);
                die("Could not read file");
            }
            if(!xml_parse($this->p, $chunk, feof($fp)))
            {
                $message = sprintf("XML error: %s at line %d",
                    xml_error_string(xml_get_error_code($this->p)),
                    xml_get_current_line_number($this->p));
                fclose($fp);
                die($message);
            }
        }
        fclose($fp);
    }

    /**
     * @return array the publication objects completed so far, in document order
     */
    public function getPubs()
    {
        return $this->publications;
    }

    /**
     * Start-tag handler.
     *
     * @param mixed  $p       the XML parser invoking the handler
     * @param string $element element name
     * @param array  $attrib  attributes of the element
     */
    public function startElement($p, $element, $attrib)
    {
        // Begin every element with an empty text buffer, as the procedural
        // version of this parser does.
        $this->data = "";
        $this->debugPrint("starting element: $element with data: *$this->data*".PHP_EOL);
        switch($element)
        {
            case 'publication':
                $this->inEntry = true;
                if(!array_key_exists('type', $attrib))
                {
                    die(self::NO_PUB_TYPE." on line: ".xml_get_current_line_number($p));
                }
                $type = $attrib['type'];
                $this->curPubCount++;
                // NOTE(review): a type not listed here leaves $this->curPub
                // unchanged, so the previous object is pushed again.
                switch($type)
                {
                    case 'article':
                        $this->curPub = new Article();
                        break;
                    case 'bookcontrib':
                        $this->curPub = new BookContrib();
                        break;
                    case 'patent':
                        $this->curPub = new Patent();
                        break;
                    case 'patentpub':
                        $this->curPub = new PatentPub();
                        break;
                }
                $this->publication[] = $this->curPub;
                break;
            case 'publist':
                break;
            default:
                if($this->inEntry)
                {
                    switch($element)
                    {
                        case 'author':
                            $this->curAuthor = array();
                            break;
                        case 'book':
                            // The book becomes the current target until </book>.
                            $this->inBook = true;
                            $this->curPub->setBook(new Book());
                            $this->publication[] = $this->curPub->getBook();
                            $this->curPub = $this->curPub->getBook();
                            break;
                    }
                }
                else
                {
                    echo $element;
                    echo 'not in entry';
                }
                break;
        }
    }

    /**
     * End-tag handler: hands the trimmed text of the element to the current
     * object and clears the text buffer.
     *
     * @param mixed  $p       the XML parser invoking the handler
     * @param string $element element name
     */
    public function endElement($p, $element)
    {
        // Use the buffer property here: a local $data does not exist yet, and
        // interpolating it raised an "undefined variable" notice per end tag.
        $this->debugPrint("ending element: $element with data of: $this->data".PHP_EOL);
        if($element == 'publication')
        {
            $this->curPub = array_pop($this->publication);
            $this->publications[] = $this->curPub;
        }
        $curPub = $this->curPub;
        $data = trim($this->data);
        switch($element)
        {
            case 'title':
                $curPub->setTitle($data);
                break;
            case 'author':
                $curPub->addAuthor($this->curAuthor);
                break;
            case 'first':
            case 'middle':
            case 'last':
            case 'suffix':
                $this->curAuthor[$element] = $data;
                break;
            case 'journal':
                $curPub->setJournal($data);
                break;
            case 'year':
                $curPub->setYear($data);
                break;
            case 'volume':
                $curPub->setVolume($data);
                break;
            case 'spage':
            case 'epage':
                $curPub->addPage($data);
                break;
            case 'note':
                $curPub->setNote($data);
                break;
            case 'book':
                // Leave the book: drop it from the stack and make the
                // enclosing object current again. This must update the
                // property, otherwise later elements still go to the book.
                array_pop($this->publication);
                $enclosing = end($this->publication);
                $this->curPub = ($enclosing === false) ? null : $enclosing;
                break;
            case 'publisher':
                $curPub->setPublisher($data);
                break;
            case 'location':
                $curPub->setLocation($data);
                break;
            case 'series':
                $curPub->setSeries($data);
                break;
            case 'country':
                $curPub->setCountry($data);
                break;
            case 'patentnum':
                $curPub->setPatentNum($data);
                break;
            case 'date':
                $curPub->setDate($data);
                break;
            case 'pagecount':
                $curPub->setPageCount($data);
                break;
            case 'link':
                $curPub->setLink($data);
                break;
            case 'patenttype':
                $curPub->setPatentType($data);
                break;
            case 'volsupplement':
                $curPub->setVolSupplement($data);
                break;
        }
        $this->data = "";
    }

    /**
     * Character-data handler: appends $content to the text buffer. The parser
     * may deliver the text of one element in several pieces.
     *
     * @param mixed  $p       the XML parser invoking the handler
     * @param string $content piece of character data
     */
    public function contents($p, $content)
    {
        $this->debugPrint("writing contents: *$content*; to data: *$this->data*".PHP_EOL);
        $this->data .= $content;
        $this->debugPrint("data now contains: *$this->data*".PHP_EOL);
    }
}
// Procedural (function-based) variant of the publication parser: the state
// lives in the global variables below and is shared through `global`.

// Message used when a <publication> element has no "type" attribute.
define('NO_PUB_TYPE', 'No publication type declared');

// Debug switches: $xmlDebug is read by parseFile(), $xmlDebugOutput by debugPrint().
$xmlDebug = false;
$xmlDebugOutput = false;

// Parser state with a defined starting value.
$inEntry = false;
$curPubCount = 0;
$data = "";
$publications = array();

// Remaining state names. These bare statements assign nothing; the variables
// get their values inside the element handlers.
$curPub;
$publication;
$curAuthor;
$inBook;
/**
 * Parses "pubs.xml" using the global handler functions startElement(),
 * endElement() and contents().
 *
 * When the global $xmlDebug is true, the file is instead parsed into a
 * structure that is printed with print_r().
 * Terminates the script with die() if the file cannot be opened or read, or
 * if the XML parser reports an error.
 */
function parseFile()
{
    global $xmlDebug;

    $file = "pubs.xml";
    $p = xml_parser_create();
    xml_parser_set_option($p, XML_OPTION_SKIP_WHITE, 1);
    xml_parser_set_option($p, XML_OPTION_CASE_FOLDING, 0);

    if($xmlDebug)
    {
        xml_parse_into_struct($p, implode("", file($file)), $val, $inx);
        print_r($val);
        print_r($inx);
        return;
    }

    xml_set_element_handler($p, 'startElement', 'endElement');
    xml_set_character_data_handler($p, 'contents');

    $fp = fopen($file, "r") or die("Could not open file");

    // Read in fixed-size chunks and loop on feof(): this does not depend on
    // filesize() being non-zero or on the chunk being "truthy", and the last
    // call tells the parser that the document is complete.
    while(!feof($fp))
    {
        $chunk = fread($fp, 8192);
        if($chunk === false)
        {
            fclose($fp);
            die("Could not read file");
        }
        if(!xml_parse($p, $chunk, feof($fp)))
        {
            $message = sprintf("XML error: %s at line %d",
                xml_error_string(xml_get_error_code($p)),
                xml_get_current_line_number($p));
            fclose($fp);
            die($message);
        }
    }
    // Release the file handle once parsing is done.
    fclose($fp);
}
/**
 * Start-tag handler of the procedural parser.
 *
 * Clears the text buffer, creates a new publication object for
 * <publication>, resets the author array for <author>, and switches the
 * current object to a new Book for <book>. Elements seen before the first
 * <publication> (other than <publist>) are echoed with 'not in entry'.
 *
 * @param mixed  $p       the XML parser invoking the handler
 * @param string $element element name
 * @param array  $attrib  attributes of the element
 */
function startElement($p, $element, $attrib)
{
    global $data, $inEntry, $curPubCount, $curPub, $publication, $curAuthor, $inBook;

    $data = "";
    debugPrint("starting element: $element with data: *$data*".PHP_EOL);

    // The list wrapper carries no information of its own.
    if ($element === 'publist') {
        return;
    }

    if ($element === 'publication') {
        $inEntry = true;
        if (!array_key_exists('type', $attrib)) {
            die(NO_PUB_TYPE." on line: ".xml_get_current_line_number($p));
        }
        $type = $attrib['type'];
        $curPubCount++;

        // An unlisted type leaves $curPub as it was.
        if ($type === 'article') {
            $curPub = new Article();
        } elseif ($type === 'bookcontrib') {
            $curPub = new BookContrib();
        } elseif ($type === 'patent') {
            $curPub = new Patent();
        } elseif ($type === 'patentpub') {
            $curPub = new PatentPub();
        }

        $publication[] = $curPub;
        return;
    }

    if (!$inEntry) {
        echo $element;
        echo 'not in entry';
        return;
    }

    if ($element === 'author') {
        $curAuthor = array();
    } elseif ($element === 'book') {
        // Push the book and make it the target of the following elements.
        $inBook = true;
        $curPub->setBook(new Book());
        $publication[] = $curPub->getBook();
        $curPub = $curPub->getBook();
    }
}
/**
 * End-tag handler of the procedural parser.
 *
 * Passes the trimmed text collected for the element to the current object
 * (global $curPub), maintains the object stack for </publication> and
 * </book>, and finally empties the text buffer.
 *
 * @param mixed  $p       the XML parser invoking the handler
 * @param string $element element name
 */
function endElement($p, $element)
{
    global $curPub, $publications, $data, $curAuthor, $publication;

    debugPrint("ending element: $element with data of: $data".PHP_EOL);

    // A finished publication comes off the stack and joins the result list.
    if ($element == 'publication') {
        $curPub = array_pop($publication);
        $publications[] = $curPub;
    }

    $data = trim($data);

    // Elements whose text goes straight to one method of the current object.
    $textTargets = array(
        'title'         => 'setTitle',
        'journal'       => 'setJournal',
        'year'          => 'setYear',
        'volume'        => 'setVolume',
        'spage'         => 'addPage',
        'epage'         => 'addPage',
        'note'          => 'setNote',
        'publisher'     => 'setPublisher',
        'location'      => 'setLocation',
        'series'        => 'setSeries',
        'country'       => 'setCountry',
        'patentnum'     => 'setPatentNum',
        'date'          => 'setDate',
        'pagecount'     => 'setPageCount',
        'link'          => 'setLink',
        'patenttype'    => 'setPatentType',
        'volsupplement' => 'setVolSupplement',
    );
    $nameParts = array('first', 'middle', 'last', 'suffix');

    if (isset($textTargets[$element])) {
        $method = $textTargets[$element];
        $curPub->$method($data);
    } elseif (in_array($element, $nameParts, true)) {
        $curAuthor[$element] = $data;
    } elseif ($element === 'author') {
        $curPub->addAuthor($curAuthor);
    } elseif ($element === 'book') {
        // Drop the book and return to the object below it on the stack.
        array_pop($publication);
        $curPub = $publication[count($publication)-1];
    }

    $data = "";
}
/**
 * Character-data handler of the procedural parser: appends $content to the
 * global text buffer $data, tracing the buffer before and after.
 *
 * @param mixed  $p       the XML parser invoking the handler
 * @param string $content piece of character data
 */
function contents($p, $content)
{
    global $data;

    $traceBefore = "writing contents: *$content*; to data: *$data*".PHP_EOL;
    debugPrint($traceBefore);

    $data = $data . $content;

    $traceAfter = "data now contains: *$data*".PHP_EOL;
    debugPrint($traceAfter);
}
/**
 * Echoes $s when the global $xmlDebugOutput switch is truthy; otherwise does
 * nothing.
 *
 * @param string $s message to print
 */
function debugPrint($s)
{
    global $xmlDebugOutput;

    if (!$xmlDebugOutput) {
        return;
    }

    echo $s;
}
Code: Select all
// Driver: the OO parser is the default; adding a "func" query parameter
// switches to the procedural implementation so both can be compared.
if (!isset($_GET['func'])) {
    require_once("PubsParser.php");
    $p = new PubsParser();
    $p->parseFile();
    $pubs = $p->getPubs();
} else {
    require_once("exPubs.php");
    parseFile();
    // The procedural parser leaves its results in the global $publications.
    $pubs = $publications;
}

// Each publication object prints itself.
foreach ($pubs as $pub) {
    $pub->printPublication();
}