I've been working on a search engine for my site that can handle HTML and PHP pages. This is just a snippet that extracts the links from a page and converts each one to an absolute address. You can extend it to extract images as well.
Code: Select all
<?php
// Demo driver: fetch one page, extract its links and dump them.
$file_name = "http://server/file_name.htm";
$links_array = get_links($file_name);
print("<pre>");
// The links come from a scraped page, so escape them before writing
// them into our own HTML output.
print(htmlspecialchars(print_r($links_array, true)));
print("</pre>");
//*********************************************//
// array get_links(string file_name)
//
// requires: name (path or URL) of HTML file to read
// returns: array of absolute URLs, one per link found on the page;
//          empty array if the file cannot be read or has no links
//
// limitations: links to be enclosed in double quotes (")
//              only <a> and <area> tags are examined
//*********************************************//
function get_links($file_name)
{
    $links = array();

    // creates array from file
    // suppress the warning if the file is not found; an unreadable
    // file simply yields no links
    $file_array = @file($file_name);
    if ($file_array === false)
    {
        return $links;
    }

    // Place array elements in string, separating by a space
    $file_string = implode(" ", $file_array);

    // Match the href attribute inside a single <a ...> or <area ...> tag.
    // [^>]*? keeps the match inside one tag, so several links on the
    // same line are all found (a greedy .* would keep only the last).
    // The href value itself is captured in group 1.
    $pattern = '/<(?:a|area)(?=\s)[^>]*?\shref\s*=\s*"([^"]*)"/i';

    // int preg_match_all(string pattern, string subject, array matches)
    if (!preg_match_all($pattern, $file_string, $found))
    {
        return $links;
    }

    // $found[1] holds the captured href values; convert each to absolute
    foreach ($found[1] as $href)
    {
        $links[] = construct_url($file_name, $href);
    }

    return $links;
}
//*********************************************//
// string construct_url(string base_url, string link)
//
// requires: HTML page to use as base for link
//           link found on above page (relative or absolute)
//
// returns: absolute URL
//          base_url itself for empty, fragment-only, mailto: and
//          javascript: links
//          the link unchanged if it already has a scheme://, or if
//          base_url has no scheme/host to resolve against
//
// limitations: anchors (#) ignored
//              errors using ../ are not reported
//              directories must end /
//              only leading ./ and ../ segments are resolved
//*********************************************//
function construct_url($base_url, $link_url)
{
    // remove anything after #
    $link_url = preg_replace('/#.*/', "", $link_url);

    // nothing to follow: empty link or a non-document scheme
    if ($link_url == ""
        || preg_match('/^(?:mailto|javascript):/i', $link_url))
    {
        return $base_url;
    }

    // link already carries scheme:// (http, https, ftp, ...)
    if (preg_match('/^[a-z][a-z0-9+.\-]*:\/\//i', $link_url))
    {
        return $link_url;
    }

    // Split base URL into components
    $base = parse_url($base_url);
    if (!is_array($base) || !isset($base["scheme"]) || !isset($base["host"]))
    {
        // no usable base to resolve against
        return $link_url;
    }

    // scheme://host[:port] - keep the port so links stay on the same server
    $root = $base["scheme"] . "://" . $base["host"];
    if (isset($base["port"]))
    {
        $root .= ":" . $base["port"];
    }
    // parse_url() omits "path" for URLs such as http://server
    $base_path = isset($base["path"]) ? $base["path"] : "";

    // protocol-relative link (//host/path): borrow the base scheme
    if (substr($link_url, 0, 2) == "//")
    {
        return $base["scheme"] . ":" . $link_url;
    }

    // link starts at the root directory
    if ($link_url[0] == "/")
    {
        return $root . $link_url;
    }

    // link is only a query: same document, new query string
    if ($link_url[0] == "?")
    {
        return $root . ($base_path == "" ? "/" : $base_path) . $link_url;
    }

    // split path at directory separator (/) and remove the last element,
    // which SHOULD be the file name or the empty string after a final /
    $directory_parts = explode("/", $base_path);
    array_pop($directory_parts);

    // consume leading ./ and ../ segments only; each ../ moves up one
    // directory (popping an already empty list is harmless)
    while (true)
    {
        if (substr($link_url, 0, 3) == "../")
        {
            $link_url = substr($link_url, 3);
            array_pop($directory_parts);
        }
        elseif (substr($link_url, 0, 2) == "./")
        {
            $link_url = substr($link_url, 2);
        }
        else
        {
            break;
        }
    }

    // join elements of directory with separator (/) and re-construct URL
    $directory = join("/", $directory_parts);

    return $root . $directory . "/" . $link_url;
}
//*********************************************//
// int starts_with(string string, string perl_regular_expression)
//
// requires: string to search
//           perl regular expression to search for
//
// returns: 1 if string starts with perl_regular_expression
//          0 otherwise (false if the expression is invalid)
//
// notes: do not need leading or trailing slash in perl_regular_expression
//        special characters must be escaped
//        matching is case-insensitive
//*********************************************//
function starts_with($string, $regex)
{
    // Wrap the expression in a non-capturing group so that ^ anchors the
    // whole expression; without it, "a|b" would anchor only the "a".
    $pattern = "/^(?:" . $regex . ")/i";
    return preg_match($pattern, $string);
}
//*********************************************//
// int ends_with(string string, string perl_regular_expression)
//
// requires: string to search
//           perl regular expression to search for
//
// returns: 1 if string ends with perl_regular_expression
//          0 otherwise (false if the expression is invalid)
//
// notes: do not need leading or trailing slash in perl_regular_expression
//        special characters must be escaped
//        matching is case-insensitive
//*********************************************//
function ends_with($string, $regex)
{
    // Wrap the expression in a non-capturing group so that $ anchors the
    // whole expression; without it, "a|b" would anchor only the "b".
    $pattern = "/(?:" . $regex . ")$/i";
    return preg_match($pattern, $string);
}
//*********************************************//
// string escape_string(string string)
//
// requires: literal text that is to be embedded in a pattern
//
// returns: the text with a back-slash (\) placed in front of every
//          character that is special in a Perl regular expression,
//          and in front of the / pattern delimiter
//
//*********************************************//
function escape_string($string)
{
    // "/" is passed as the delimiter so that it is escaped as well
    $delimiter = '/';
    $escaped = preg_quote($string, $delimiter);

    return $escaped;
}
?>
