When the spider visits a page, this code is supposed to try to find a <pre> tag in it. If it does, it adds the page to the database.
Code: Select all
<?php
// Index the fetched page if it contains a <pre> tag (tab/chord content).
// NOTE(review): the original matched against the literal string "<pre>" (and was
// missing a closing quote — a parse error), so it could never inspect the page;
// the subject should almost certainly be the fetched page content — TODO confirm
// the variable name used by the caller.
if (preg_match("/<pre/i", $Content)) {
    include("_dbcon.inc.php"); // defines $server, $dbusr, $dbpas, $db, $table, $error
    $con = mysql_connect($server, $dbusr, $dbpas)
        or die($error["nomysql"]);
    mysql_select_db("$db");
    $tab = $table[0][0];
    // Escape the crawled values: $url/$tit/$kw/$des come from untrusted pages
    // and were previously interpolated into SQL verbatim (SQL injection).
    $e_url = mysql_real_escape_string($url, $con);
    $e_tit = mysql_real_escape_string($tit, $con);
    $e_kw  = mysql_real_escape_string($kw, $con);
    $e_des = mysql_real_escape_string($des, $con);
    $query = "INSERT $tab VALUES ('$e_url', '$e_tit', '$e_kw', '$e_des','','');";
    mysql_query($query);
    if (mysql_errno() != 0) {
        // INSERT failed (most likely a duplicate url key): update the row instead.
        // Original bugs: the UPDATE used $table (an array, see $table[0][0] above)
        // instead of $tab, and the unconditional mysql_query() after this block
        // re-ran the INSERT even when it had already succeeded.
        $query = "UPDATE $tab SET tit='$e_tit',kw='$e_kw',des='$e_des' where url='$e_url';";
        mysql_query($query);
    }
    mysql_close($con);
}
?>
Here is the spider (spider.php):
Code: Select all
<?php
// spider.php — simple single-site web spider (entry point script).
error_reporting(63); //This is still experimentation, so better show all errors
define('VALID_EXT','(\.htm|\.html|\.php|\.php3|/)'); //Regex alternation of link extensions (or trailing slash) we will follow
$link_todo = array(); //queue of links still to process; each entry is array('url'=>..., 'level'=>...)
$link_done = array(); //flat list of urls which have already been processed
require('url.inc'); // url helpers used below: absolute_url(), compose_url() (not shown here)
require('t_socket.inc'); // DQ's "turbo socket" class; ua::request() uses it for raw HTTP
// Class ua — minimal HTTP/1.1 user agent. Ask it to fetch a page for you;
// results land in $RC / $RCString / $Header / $Content.
class ua {
    var $RC = 0;            // HTTP return code (-1 on connection failure or malformed URL)
    var $RCString = '';     // HTTP reason phrase ("OK", "Not Found", ...) or error string
    var $Header = array();  // response headers keyed by lowercased name (fix: was initialized as '')
    var $Content = '';      // raw response body as a string
    var $name = 'TabSearch.tk Spider'; // User-Agent string sent with each request

    // Override the default User-Agent string; an empty value is ignored.
    // NOTE(review): despite its name this is a plain method, not a constructor
    // (a PHP4 constructor would have to be named ua()).
    function url($ua = '')
    {
        // Fix: original wrote "$this->$name" (a variable-variable on an undefined
        // $name) instead of the 'name' property, so the override never worked.
        if (!empty($ua)) $this->name = $ua;
    }

    // Fetch $url. $headonly=true sends a HEAD request instead of GET.
    function request($url, $headonly = false)
    {
        if (($url_array = parse_url($url)) != FALSE) {
            if (!isset($url_array['path'])) $url_array['path'] = '/';
            $f = new socket($url_array['host'], 80, 10, 10, 1); // host, port 80, timeouts (see t_socket.inc)
            if (!($f->eof())) {
                $f->write((($headonly) ? "HEAD" : "GET")." $url_array[path] HTTP/1.1\r\n");
                $f->write("Host: $url_array[host]\r\n");
                $f->write("User-Agent: ".$this->name."\r\n");
                $f->write("Accept: */*\r\n");
                $f->write("\r\n");
                // Status line, e.g. "HTTP/1.1 200 OK".
                $Status = ''; // fix: was undefined below when the stream hit EOF immediately
                if (!($f->eof())) { $Status = $f->readline(); }
                if (preg_match('@(\d{3}) (.*)@', $Status, $reg_array)) {
                    $this->RC = $reg_array[1];
                    $this->RCString = $reg_array[2];
                }
                // Header lines ("Name: value") up to the blank separator line.
                while (!($f->eof())) {
                    $line = $f->readline();
                    if ($line == "\r\n") break;
                    // Fix: explode without a limit split header *values* containing
                    // ': ', and list() warned on lines with no ': ' at all.
                    $parts = explode(': ', $line, 2);
                    $name = $parts[0];
                    $val = isset($parts[1]) ? $parts[1] : '';
                    if (empty($val)) { $val = 'N/A'; }
                    $this->Header[strtolower($name)] = trim($val);
                }
                // Everything after the blank line is the body.
                $this->Content = "";
                while (!($f->eof())) { $this->Content .= $f->readline(); }
                $f->close();
            } else {
                $this->RC = -1;
                $this->RCString = $f->errstring;
            }
        } else {
            $this->RC = -1;
            $this->RCString = 'malformed URL';
        }
    }

    // Return the value of a specific (lowercased) header, or '' if absent.
    function GetHeader($name)
    {
        if (isset($this->Header[$name])) return $this->Header[$name];
        else return '';
    }
}; //end class ua
//This function must decide if a link is worth following.
//Currently all it does is to keep only http links (not mailto, not ftp, ...) pointing to VALID_EXT file (htm, html, php, ...)
//that are located on the same server as the starting url
//It also checks if a link wasn't already processed
//and finally it checks if the link isn't already on the to_follow list on a higher level
//Decide whether a link is worth following.
//Keeps only http links with a VALID_EXT path on the same host as $base,
//rejects links already visited, and rejects links already queued (promoting
//the queued entry to a shallower level when appropriate).
function checklink($base, $link, $level)
{
    global $link_done;
    global $link_todo;
    // Fix: the original extract(parse_url(...)) left $scheme/$path undefined
    // (warnings, and a failing preg_match) for URLs like "http://host".
    $l = parse_url($link);
    $b = parse_url($base);
    $scheme = isset($l['scheme']) ? $l['scheme'] : '';
    $host   = isset($l['host'])   ? $l['host']   : '';
    $path   = isset($l['path'])   ? $l['path']   : '';
    $b_host = isset($b['host'])   ? $b['host']   : '';
    if ($scheme != "http") return FALSE;                        //keep only http links
    if (!preg_match('!'.VALID_EXT.'$!i', $path)) return FALSE;  //keep only valid extensions
    if ($host != $b_host) return FALSE;                         //keep only urls on the base server
    //Already visited? (foreach replaces each(), which was removed in PHP 8)
    foreach ($link_done as $a_link) {
        if ($link == $a_link) return FALSE;
    }
    //Already queued? If so, maybe promote it to a shallower level.
    foreach ($link_todo as $idx => $a_link) {
        if ($link == $a_link['url']) {
            // Fix: the original assigned to $a_link, a copy returned by each(),
            // so the level promotion never reached the real queue entry.
            if ($a_link['level'] > $level + 1) $link_todo[$idx]['level'] = $level + 1;
            return FALSE; //link already in list
        }
    }
    return TRUE;
}
//This function remove fragment and query part from an url
//Strip the fragment (#...) and query (?...) parts from a url,
//then rebuild it with compose_url() (from url.inc).
function filter_link($link)
{
    $parts = parse_url($link);
    //unset() is a silent no-op for keys that are absent, so no isset() guard needed
    unset($parts['fragment'], $parts['query']);
    return compose_url($parts);
}
//move first link to follow on top of array depending on searching method
//usort() comparator: orders $link_todo so the next link to crawl comes first.
//BFS puts the shallowest level first; DFS (the default) the deepest first.
function sort_link_todo($a, $b)
{
    global $method;
    if (!is_array($a) || !is_array($b)) return 0; //patch by dq
    $la = $a['level'];
    $lb = $b['level'];
    if ($la == $lb) return 0;
    if ($method == 'BFS') {
        return ($la > $lb) ? 1 : -1; //ascending levels: breadth-first
    }
    return ($la > $lb) ? -1 : 1;     //descending levels: depth-first (default)
}
//Main crawl loop: repeatedly pop the best link off $link_todo, fetch it with
//the ua agent, and queue any new same-site links found in the page. Stops when
//the todo list is empty or $timeout seconds have elapsed since $start_time.
function crawl()
{
    global $link_done;
    global $link_todo;
    global $maxlevel;
    global $start_time;
    global $timeout;
    while ((count($link_todo) > 0) && (time() - $start_time < $timeout)) {
        usort($link_todo, 'sort_link_todo'); //best link first, per current crawl method
        //Pop the first entry. (fix: each() was removed in PHP 8 — use reset()/key())
        reset($link_todo);
        $idx = key($link_todo);
        $link = $link_todo[$idx];
        unset($link_todo[$idx]); //remove it from the queue
        extract($link); //yields $url and $level
        $link_done[] = $url; //mark this page as processed
        echo("fetching $url<br>");
        $agent = new ua();
        $agent->request($url); //ask our agent to fetch the page for us
        echo("return code : ".$agent->RC."<br>");
        if ($agent->RC == '200') { //200 OK result?
            $Content = $agent->Content; //work on a copy of the page content
            //!!!!!!!!!!! Here we can process the page content !!!!!!!!!!!!!!!!!!
            //Extract all links, unless we already reached the maximum depth.
            if (($maxlevel == 0) || ($level < $maxlevel)) {
                //Fix: the original stripped each matched HREF out of $Content with
                //preg_replace + preg_quote (without the delimiter argument), which
                //broke on urls containing '!' and rescanned the page once per link
                //(O(n^2)). One preg_match_all finds every link in a single pass;
                //duplicates are rejected by checklink() anyway.
                if (preg_match_all('!HREF="([^"]*)"!i', $Content, $matches)) {
                    foreach ($matches[1] as $found) {
                        $newlink = absolute_url($url, $found); //make absolute relative to this page
                        $newlink = filter_link($newlink);      //strip any fragment/query part
                        if (checklink($url, $newlink, $level)) {
                            $link_todo[] = array('url' => $newlink, 'level' => $level + 1);
                        }
                    }
                }
            }
        }
        else print("<b>cannot reach link : $url / Return error = ".$agent->RCString."</b><BR>");
        flush();
    }
}
//Process _GET parameters, falling back to sensible defaults.
//Fixes: the original read $_GET without isset() (warnings); when start_url was
//absent it assigned the default but the else-branch then skipped the crawl
//entirely (so the default URL was never crawled); $maxlevel/$timeout/$method
//stayed undefined because the get_param() calls were commented out; and
//crawl($abs_url) passed an argument that crawl() ignores.
$start_url = isset($_GET['start_url']) ? $_GET['start_url'] : 'http://www.searchlores.org/';
$maxlevel  = isset($_GET['maxlevel'])  ? (int)$_GET['maxlevel'] : 3;      //how deep to crawl (0 = unlimited)
$timeout   = isset($_GET['timeout'])   ? (int)$_GET['timeout']  : 5 * 60; //overall crawl budget in seconds
$method    = isset($_GET['method'])    ? $_GET['method']        : 'DFS';  //'BFS' or 'DFS'
$abs_url = absolute_url($start_url, $start_url); //normalize the starting url
print("absolute url = $abs_url<BR>");
flush();
$start_time = time();
$link_todo[] = array('url' => $abs_url, 'level' => 0); //seed the queue with the normalized url
crawl();
$delta = time() - $start_time;
printf("search took : %d sec<BR>", $delta);
print("total processed pages : ".count($link_done)."<BR>");
?>