Spider Help
Posted: Sun Sep 26, 2004 6:44 pm
I found the framework for a PHP search-engine spider on the internet, because I have been trying to make my own. I have written the rest, except I can't seem to figure out where to put the remaining piece (I am not that good at PHP). Can someone show me where I can put the following piece of code? I also need to take the first few lines of each page as a description, along with the keywords and the title, and put them in variables so they can be added to the database (the variables are in the MySQL insert string).
When the spider visits a page, this code is supposed to try to find a <pre> tag in it. If it does, it adds the page to the database.
I need to put that in the spider script so that, when the spider opens a page, it can search for that tag and add the page to the database if needed — and I also need to know how to get the description, keywords and title.
here is the spider (spider.php):
When the spider visits a page, this code is supposed to try to find a <pre> tag in it. if it does, it adds it to the database.
Code: Select all
<?php
//If the page contains a <pre> tag, insert (or update) its record in the database.
//BUGFIX: the original was missing the closing quote on "<pre>", which is a
//parse error. Note that matching against the literal string "<pre>" is
//always true - when this is pasted into the spider, the second argument
//should be the fetched page body (e.g. $Content).
if (preg_match("/<pre/i", "<pre>")) {
    include("_dbcon.inc.php"); //provides $server, $dbusr, $dbpas, $db, $table, $error
    $con = mysql_connect($server, $dbusr, $dbpas)
        or die($error["nomysql"]);
    mysql_select_db("$db");
    $tab = $table[0][0];
    //NOTE(review): $url, $tit, $kw and $des are interpolated straight into the
    //SQL string - run them through mysql_real_escape_string() first.
    $query = "INSERT $tab VALUES ('$url', '$tit', '$kw', '$des','','');";
    mysql_query($query);
    if (mysql_errno() != 0) {
        //The row already exists: update it instead.
        //BUGFIX: the UPDATE used $table (an array) instead of $tab, and the
        //original ran mysql_query() a second time even when the INSERT had
        //succeeded (re-executing the same INSERT).
        $query = "UPDATE $tab SET tit='$tit',kw='$kw',des='$des' where url='$url';";
        mysql_query($query);
    }
    mysql_close($con);
}
?>
here is the spider (spider.php):
Code: Select all
<?php
//spider.php - simple web crawler (PHP4-era forum code).
error_reporting(63); //This is still experimentation, so better show all error
                     //NOTE(review): 63 = E_ERROR|E_WARNING|E_PARSE|E_NOTICE|E_CORE_ERROR|E_CORE_WARNING
define('VALID_EXT','(\.htm|\.html|\.php|\.php3|/)'); //Those are the links extension that we will follow
$link_todo = array(); //list of links we have to process; each entry is array('url' => ..., 'level' => ...)
$link_done = array(); //list of links which have been processed (plain url strings)
require('url.inc'); // We need some url related functions (absolute_url, compose_url)
require('t_socket.inc'); // We need DQ's turbo socket to connect to the world (class socket)
//Class user agent: ask it to fetch a page for you, then read the result
//from the RC / RCString / Header / Content properties.
class ua {
    var $RC = 0;            //HTTP return code (-1 on connection/parse failure)
    var $RCString = '';     //HTTP return string ("OK", "Not Found", ...)
    //BUGFIX: was initialized to '' (a string) but used as an array below,
    //which is a fatal string-offset write on modern PHP.
    var $Header = array();  //Response headers, keyed by lower-cased header name
    var $Content = '';      //Page content (string)
    var $name = 'TabSearch.tk Spider'; //User-Agent string sent with requests

    //Set a custom User-Agent string.
    //NOTE(review): this looks like it was meant to be the (PHP4-style)
    //constructor, but it is named url() and so is never called automatically.
    function url($ua = '')
    {
        //BUGFIX: was "$this->$name = $ua" (a variable-variable), which wrote
        //to a property whose name is the *value* of the undefined local $name.
        if (isset($ua) && !empty($ua)) $this->name = $ua;
    }

    //Fetch $url over HTTP/1.1 (HEAD when $headonly is true) and fill in
    //RC, RCString, Header and Content. No return value.
    function request($url, $headonly = false)
    {
        if (($url_array = parse_url($url)) != FALSE) {
            if (!isset($url_array['path'])) $url_array['path'] = '/';
            $f = new socket($url_array['host'], 80, 10, 10, 1); //from t_socket.inc
            if (!($f->eof())) {
                $f->write((($headonly) ? "HEAD" : "GET") . " $url_array[path] HTTP/1.1\r\n");
                $f->write("Host: $url_array[host]\r\n");
                $f->write("User-Agent: " . $this->name . "\r\n");
                $f->write("Accept: */*\r\n");
                $f->write("\r\n");
                //BUGFIX: $Status could be used uninitialized when the socket
                //reports EOF right away.
                $Status = '';
                if (!($f->eof())) { $Status = $f->readline(); }
                //Parse "HTTP/1.x NNN Reason" status line.
                if (preg_match('@(\d{3}) (.*)@', $Status, $reg_array)) {
                    $this->RC = $reg_array[1];
                    $this->RCString = $reg_array[2];
                }
                //Read headers until the blank line separating them from the body.
                while (!($f->eof())) {
                    $line = $f->readline();
                    if ($line == "\r\n") break;
                    //BUGFIX: guard against header lines without ": " so the
                    //value is not filled from a missing list() element.
                    $parts = explode(': ', $line, 2);
                    $name = $parts[0];
                    $val = isset($parts[1]) ? $parts[1] : '';
                    if (empty($val)) { $val = 'N/A'; }
                    $this->Header[strtolower($name)] = trim($val);
                }
                //Everything after the blank line is the body.
                $this->Content = "";
                while (!($f->eof())) { $this->Content .= $f->readline(); }
                $f->close();
            } else {
                $this->RC = -1;
                $this->RCString = $f->errstring;
            }
        } else {
            $this->RC = -1;
            $this->RCString = 'malformed URL';
        }
    }

    //Return a specific (lower-cased) header value, or '' when absent.
    function GetHeader($name)
    {
        if (isset($this->Header[$name])) return $this->Header[$name];
        else return '';
    }
}; //end class ua
//Decide whether a link is worth following.
//Keeps only http links, with a VALID_EXT extension, located on the same
//server as $base, that were not already visited and are not already queued.
//If the link is already queued at a deeper level, its queued level is
//lowered to $level+1.
//Returns TRUE when the link should be added to the to-do list.
function checklink($base, $link, $level)
{
    global $link_done;
    global $link_todo;
    //BUGFIX: extract() was replaced by explicit array access - it sprayed
    //parse_url() keys into the local scope, which is fragile and unsafe.
    $l = parse_url($link);
    $b = parse_url($base);
    $scheme = isset($l['scheme']) ? $l['scheme'] : '';
    $host   = isset($l['host'])   ? $l['host']   : '';
    $path   = isset($l['path'])   ? $l['path']   : '';
    $b_host = isset($b['host'])   ? $b['host']   : '';
    if ($scheme != "http") return FALSE; //Keep only http links
    if (!preg_match('!'.VALID_EXT.'$!i', $path)) return FALSE; //Keep only valid extension
    if ($host != $b_host) return FALSE; //Keep only urls on the base server
    //Check if link wasn't already visited.
    //BUGFIX: the reset()/each() loops were removed - each() no longer exists
    //in PHP 8.
    if (in_array($link, $link_done)) return FALSE;
    //Check if link isn't already on the to-do list.
    foreach ($link_todo as $idx => $a_link) {
        if ($link == $a_link['url']) {
            //BUGFIX: the original modified a *copy* of the queue entry, so
            //the level was never actually lowered; write through $link_todo.
            if ($a_link['level'] > $level + 1) {
                $link_todo[$idx]['level'] = $level + 1;
            }
            //link already in list, so return false
            return FALSE;
        }
    }
    return TRUE;
}
//Strip the fragment (#...) and query (?...) parts from an url and
//return the recomposed url (compose_url comes from url.inc).
function filter_link($link)
{
    $parts = parse_url($link);
    unset($parts['fragment'], $parts['query']); //unset() ignores absent keys
    return compose_url($parts);
}
//usort() comparator for the to-do list: orders entries so the next link to
//crawl comes first, depending on the global crawl $method
//(BFS = shallowest level first, DFS = deepest level first; DFS is the default).
function sort_link_todo($a, $b)
{
    global $method;
    //patch by dq: tolerate malformed (non-array) entries
    if (!is_array($a) || !is_array($b)) {
        return 0;
    }
    $la = $a['level'];
    $lb = $b['level'];
    if ($la == $lb) {
        return 0;
    }
    if ($method == 'BFS') {
        //breadth-first: lower levels sort to the front
        return ($la > $lb) ? 1 : -1;
    }
    //depth-first (and any unknown method): higher levels sort to the front
    return ($la > $lb) ? -1 : 1;
}
//Main crawl loop: fetch queued links until the queue is empty or the time
//budget runs out. Operates on globals set up by the calling script.
function crawl()
{
    global $link_done;  //urls already processed
    global $link_todo;  //queue of array('url' => ..., 'level' => ...) entries
    global $maxlevel;   //maximum crawl depth (0 = unlimited)
    global $start_time; //crawl start timestamp
    global $timeout;    //overall time budget in seconds
    while ((count($link_todo) > 0) && (time() - $start_time < $timeout)) {
        //sort list of links to do based on current crawl method
        usort($link_todo, 'sort_link_todo');
        //BUGFIX: reset()/list()/each() (each() was removed in PHP 8) replaced
        //by explicit first-key access to pop the top link off the queue.
        $keys = array_keys($link_todo);
        $idx = $keys[0];
        $url = $link_todo[$idx]['url'];
        $level = $link_todo[$idx]['level'];
        unset($link_todo[$idx]); //remove it from list
        $link_done[] = $url; //add this page to the array of already processed pages
        //escape: urls come from remote pages / the query string (XSS)
        echo("fetching ".htmlspecialchars($url)."<br>");
        $agent = new ua();
        $agent->request($url); //Ask our agent to fetch the page for us
        echo("return code : ".$agent->RC."<br>");
        if ($agent->RC == '200') { //200 OK Result ?
            $Content = $agent->Content; //Let's work on a copy of the page content
            //!!!!!!!!!!! Here we can process the page content !!!!!!!!!!!!!!!!!!
            //Now we will extract all links from the fetched page,
            //but only if we didn't reach the max level of deepness.
            if (($maxlevel == 0) || ($level < $maxlevel)) {
                //Loop through all links; each match is blanked out of the
                //copy so we won't keep processing the same link.
                while (preg_match('!HREF="([^"]*)"!i', $Content, $reg_array)) {
                    $link = $reg_array[1]; //Get a link
                    $replace = preg_quote($reg_array[0]);
                    $Content = preg_replace('!'.$replace.'!', '', $Content);
                    $link = absolute_url($url, $link); //Make it absolute regarding the current page
                    $link = filter_link($link); //Remove any fragment or query part
                    //check if link is worth following and eventually queue it
                    if (checklink($url, $link, $level)) {
                        $link_todo[] = array('url' => $link, 'level' => $level + 1);
                    }
                }
            }
        }
        else print("<b>cannot reach link : ".htmlspecialchars($url)." / Return error = ".$agent->RCString."</b><BR>");
        flush();
    }
}
//---------------------------------------------------------------------------
//Main script: read the crawl parameters from the query string and start.
//Defaults follow the commented-out get_param() calls in the original post:
//maxlevel=3, timeout=5min, method=DFS, start_url=http://www.searchlores.org/
//---------------------------------------------------------------------------
//BUGFIX: the original read $_GET keys unconditionally (undefined-index
//notices) and - worse - only called crawl() in the else branch, so when no
//start_url was supplied the default was assigned but never crawled.
$start_url = isset($_GET['start_url']) ? $_GET['start_url'] : 'http://www.searchlores.org/';
$maxlevel  = isset($_GET['maxlevel'])  ? (int)$_GET['maxlevel'] : 3;     // How deep shall we crawl (0 = no limit)
$timeout   = isset($_GET['timeout'])   ? (int)$_GET['timeout']  : 5*60;  // default time out = 5min
$method    = isset($_GET['method'])    ? $_GET['method']        : 'DFS'; // Which way to follow (BFS or DFS)
$abs_url = absolute_url($start_url, $start_url);
//escape: start_url comes straight from the query string (XSS)
print("absolute url = ".htmlspecialchars($abs_url)."<BR>");
flush();
$start_time = time();
$link_todo[] = array('url' => $start_url, 'level' => 0);
//BUGFIX: crawl() takes no parameters; the original passed $abs_url, which
//was silently ignored.
crawl();
$delta = time() - $start_time;
printf("search took : %d sec<BR>", $delta);
print("total processed pages : ".count($link_done)."<BR>");
?>