For a class project I am creating an Inverted Index from the TIME magazine article collection. This will, in time, be used for a search engine. As you can see from the code I've posted below, this is complete. However, since the article collection is fairly large (423 articles) I would like to look at different ways of optimizing my code.
As it stands, this code executes on my server at ~11.5 seconds.
Does anyone have any suggestions on my use of the file_get_contents(), preg_replace(), and strtok() functions, or on the use of two-dimensional arrays, that could help me shave off some seconds? I know it's a lot to read, but any help is appreciated.
Thanks in advance.
Code: Select all
<?php
include('stemmer.class.inc');

//running counters for the indexing pass
$docID = 0;      //actual article number in sequence
$textDocID = 0;  //article number taken from TIME.ALL itself
$index = 0;      //next free slot in the dictionary
$dictionary = array();

//stopwords are space-padded so str_replace() only removes whole
//words, never substrings inside a longer word
$stopwords = array(
    " i ", " a ", " about ", " an ", " and ", " are ", " as ", " at ",
    " be ", " but ", " by ", " com ", " de ", " en ", " for ", " from ",
    " has ", " how ", " in ", " is ", " it ", " la ", " of ", " on ",
    " or ", " that ", " the ", " this ", " to ", " was ", " what ",
    " when ", " where ", " who ", " will ", " with ", " und ", " the ",
    " www "
);

$time_start = microtime(true);
///////
//slurp the whole collection, flatten every kind of line break to a
//single space, then start tokenizing on spaces
$filename = "./test_archive/TIME.ALL";
$fileString = preg_replace("/\n\r|\n|\r/", " ", file_get_contents($filename));
$word = strtok($fileString, " ");
//Main parsing loop: walk the token stream, carving it into articles and
//accumulating one (term, docID, freq) dictionary entry per distinct term
//per article.
/*
Format of file:
     docID    date  pageNum
      ^        ^       ^
*TEXT ### ##/##/## PAGE ###
... article contents ...
*TEXT ### ##/##/## PAGE ###
... more articles...
*STOP
*/
$column = array(); //parallel list of terms, needed to sort $dictionary later on
while ($word !== false) { //strtok() returns false (not "") when input is exhausted
    if (strcmp("*TEXT", $word) == 0) { //article header found, process it
        $articleString = "";
        $docID++; //increase sequential article number
        $word = strtok(" "); //next token is the article number from the file
        $textDocID = $word;
        //echo "\nStart of article: ".$textDocID." (".$docID.")\n";
        for ($i = 0; $i <= 3; $i++) //skip the date and page-number tokens
            $word = strtok(" ");
        //keep reading words until the next article begins or the file ends;
        //also stop if the tokenizer runs dry (file missing its *STOP marker),
        //which would otherwise spin forever
        while ($word !== false && strcmp("*TEXT", $word) != 0 && strcmp("*STOP", $word) != 0) {
            $cleaned = preg_replace("/[^a-zA-Z0-9\s]/", "", strtolower($word)); //remove symbols
            if ($cleaned != "") {
                $stemmed = PorterStemmer::Stem($cleaned); //stem the word
                //pad with spaces so the whole-word stopword patterns can match,
                //strip stopwords, then trim the padding back off
                $stemmed = trim(str_replace($stopwords, "", ' '.$stemmed.' '));
                if ($stemmed != "")
                    $articleString .= " ".$stemmed;
            }
            $word = strtok(" ");
        }//end while
        //articleString now contains one cleaned article;
        //count how often each term occurs in it ("0..9" keeps digits as word characters)
        $article_word_count = array_count_values(str_word_count($articleString, 1, "0..9"));
        //append one dictionary entry per term:
        // 1. the term
        // 2. the article it appears in
        // 3. how many times it appears in that article
        foreach ($article_word_count as $term => $freq) {
            $dictionary[$index] = array(
                "term" => $term,
                "docID" => $textDocID,
                "freq" => $freq
            );
            //BUGFIX: key must be "term" (lowercase) — the old "Term" lookup
            //silently produced nulls, breaking the alphabetical sort below
            $column[] = $dictionary[$index]["term"];
            $index++;
        }
    }
    else {
        $word = strtok(" "); //not an article header, keep tokenizing
    }
    if ($word === false || strcmp("*STOP", $word) == 0) //break out at eof marker or exhausted input
        break;
}//end while
//sort the dictionary alphabetically by term ($column holds the terms in
//the same order as $dictionary's rows)
array_multisort($column, SORT_ASC, $dictionary);
//now that the dictionary is alphabetically sorted, each term's entries are
//contiguous, so we can build the lexicon while streaming postings to disk
$lexicon = array();
$postings = array();
$pointer = 0; //line number of the current postings entry
$myPostings = "postings.txt";
//NOTE(review): 'a' appends — rerunning the script keeps growing this file; confirm intended
$fhPost = fopen($myPostings, 'a');
foreach ($dictionary as $entry) {
    $term = $entry["term"];
    $docID = $entry["docID"];
    $freq = $entry["freq"];
    if (!isset($lexicon[$term])) {
        //first sighting of this term: remember where its postings start
        $lexicon[$term] = array(
            "numDocs" => 1,
            "totalFreq" => $freq,
            "pointer" => $pointer
        );
    }
    else {
        $lexicon[$term]["numDocs"] += 1;
        $lexicon[$term]["totalFreq"] += $freq;
    }
    fwrite($fhPost, $docID." ".$freq."\n");
    $pointer++;
}
fclose($fhPost);
//persist the lexicon: one line per term -> "term numDocs totalFreq pointer"
$myLexicon = "lexicon.txt";
$fhLex = fopen($myLexicon, 'a');
foreach ($lexicon as $term => $stats) {
    fwrite($fhLex, $term." ".$stats["numDocs"]." ".$stats["totalFreq"]." ".$stats["pointer"]."\n");
}
fclose($fhLex);
//////
//report wall-clock time for the whole indexing run
$time_end = microtime(true);
$time = $time_end - $time_start;
echo "\n\nExecution time: ".$time."\n";
?>