Can anyone give some advice on how to do the following....
Take an HTML document file.
Parse it to extract ALL links, including image references
Build an array that shows.....
1. The type of link
2. The full url for the link
3. The file referenced
4. The file extension / type
Help gratefully received.
Parsing HTML to extract URLs
Moderator: General Moderators
I've been working on a search engine for my site that can handle HTML and PHP pages. This is just a snippet, which extracts the links from a page and converts them to absolute addresses. You can extend it to extract images as well.

Code: Select all
<?php
// Demo driver: collect every link found on one page and dump the result.
$file_name = "http://server/file_name.htm";
$links_array = get_links($file_name);

// The links come from a page we do not control, so the dump is escaped
// before it is written into our own HTML output.
print("<pre>");
print(htmlspecialchars(print_r($links_array, true), ENT_QUOTES));
print("</pre>");
//*********************************************//
// array get_links(string file_name)
//
// requires: name (path or URL) of the HTML file to read
//
// returns: array of the links found in <a href="..."> tags, in document
//          order, each one passed through construct_url() with file_name
//          as the base
//          empty array if the file cannot be read or holds no links
//
// limitations: href values must be quoted (double or single quotes)
//              HTML entities inside a link (e.g. &amp;) are not decoded
//*********************************************//
function get_links($file_name)
{
    // Read failures are deliberately silent: a page that cannot be read
    // simply contributes no links.
    $html = @file_get_contents($file_name);
    if ($html === false) {
        return array();
    }

    // [^>]*? keeps the search inside one <a ...> tag, so several links on
    // the same line are all found and a tag may span more than one line.
    // Group 1 is the opening quote, group 2 the raw href value.
    $pattern = '/<a\b[^>]*?\shref\s*=\s*(["\'])(.*?)\1/is';
    if (!preg_match_all($pattern, $html, $found)) {
        return array();
    }

    // Convert every link to an absolute address.
    $links = array();
    foreach ($found[2] as $href) {
        $links[] = construct_url($file_name, $href);
    }
    return $links;
}
//*********************************************//
// string construct_url(string base_url, string link)
//
// requires: absolute URL of the HTML page the link was found on
//           link (href value) found on that page
//
// returns: absolute URL
//          base_url itself for empty, anchor-only (#...), mailto: and
//          javascript: links
//          the link unchanged if it already carries a scheme (http:,
//          https:, ftp:, ...) or if base_url has no scheme and host to
//          resolve against
//
// limitations: anchors (#) ignored
//              ../ steps that would climb above the root are dropped
//              directories must end /
//*********************************************//
function construct_url($base_url, $link_url)
{
    // Whitespace around an href value is not part of the address.
    $link_url = trim($link_url);

    // Remove anything after #.
    $hash_at = strpos($link_url, "#");
    if ($hash_at !== false) {
        $link_url = substr($link_url, 0, $hash_at);
    }

    // Nothing to fetch: the link points back at the page itself or is not
    // a document address at all.
    if ($link_url === "" || preg_match('/^(mailto|javascript):/i', $link_url)) {
        return $base_url;
    }

    // Any other link that names its own scheme is already fully specified.
    if (preg_match('/^[a-z][a-z0-9+.\-]*:/i', $link_url)) {
        return $link_url;
    }

    // Split the base URL into components; without scheme and host there is
    // nothing to resolve against.
    $base = parse_url($base_url);
    if (!is_array($base) || !isset($base["scheme"]) || !isset($base["host"])) {
        return $link_url;
    }

    // Protocol-relative link (//host/path): only the scheme is inherited.
    if (substr($link_url, 0, 2) === "//") {
        return $base["scheme"] . ":" . $link_url;
    }

    // scheme://[user[:pass]@]host[:port] -- the port is kept so links found
    // on a non-default port stay on that port.
    $root = $base["scheme"] . "://";
    if (isset($base["user"])) {
        $root .= $base["user"];
        if (isset($base["pass"])) {
            $root .= ":" . $base["pass"];
        }
        $root .= "@";
    }
    $root .= $base["host"];
    if (isset($base["port"])) {
        $root .= ":" . $base["port"];
    }

    $base_path = isset($base["path"]) ? $base["path"] : "";

    // Query-only link: same document, different query string.
    if ($link_url[0] === "?") {
        return $root . ($base_path === "" ? "/" : $base_path) . $link_url;
    }

    // Set the query aside so that ../ inside it is never treated as a
    // directory step.
    $query = "";
    $query_at = strpos($link_url, "?");
    if ($query_at !== false) {
        $query = substr($link_url, $query_at);
        $link_url = substr($link_url, 0, $query_at);
    }

    if ($link_url[0] === "/") {
        // Link starts at the root directory.
        $path = $link_url;
    } else {
        // Link is relative to the directory of the base document:
        // everything up to and including the last / of the base path.
        $slash_at = strrpos($base_path, "/");
        $directory = ($slash_at === false) ? "/" : substr($base_path, 0, $slash_at + 1);
        $path = $directory . $link_url;
    }

    // Resolve ./ and ../ wherever they occur in the path. $path always
    // starts with /, so the first segment is the empty string in front of
    // that slash and is never popped.
    $segments = explode("/", $path);
    $last = count($segments) - 1;
    $resolved = array();
    foreach ($segments as $position => $segment) {
        if ($segment === "." || $segment === "..") {
            if ($segment === ".." && count($resolved) > 1) {
                array_pop($resolved);
            }
            // A trailing . or .. names a directory: keep the closing slash.
            if ($position === $last) {
                $resolved[] = "";
            }
            continue;
        }
        $resolved[] = $segment;
    }

    return $root . implode("/", $resolved) . $query;
}
//*********************************************//
// int starts_with(string string, string perl_regular_expression)
//
// requires: string to search
//           perl regular expression to search for
//
// returns: 1 if string starts with perl_regular_expression
//          0 otherwise
//          (false if perl_regular_expression is not a valid pattern)
//
// notes: matching is case-insensitive
//        do not need leading or trailing slash in perl_regular_expression
//        special characters (including /) must be escaped
//*********************************************//
function starts_with($string, $regex)
{
    // The non-capturing group binds ^ to the whole expression, so every
    // branch of an alternation such as "a|b" is anchored at the start.
    $pattern = "/^(?:" . $regex . ")/i";
    return preg_match($pattern, $string);
}
//*********************************************//
// int ends_with(string string, string perl_regular_expression)
//
// requires: string to search
//           perl regular expression to search for
//
// returns: 1 if string ends with perl_regular_expression
//          0 otherwise
//          (false if perl_regular_expression is not a valid pattern)
//
// notes: matching is case-insensitive
//        do not need leading or trailing slash in perl_regular_expression
//        special characters (including /) must be escaped
//*********************************************//
function ends_with($string, $regex)
{
    // The non-capturing group binds $ to the whole expression, so every
    // branch of an alternation such as "a|b" is anchored at the end.
    $pattern = '/(?:' . $regex . ')$/i';
    return preg_match($pattern, $string);
}
//*********************************************//
// string escape_string(string string)
//
// requires: literal text that is to be placed inside a pattern
//
// returns: the text with a back-slash (\) in front of every character
//          that is special in a Perl regular expression, and in front
//          of the / pattern delimiter
//
//*********************************************//
function escape_string($string)
{
    // Delimiter used by the patterns in this file; it has to be escaped
    // along with the regular expression special characters.
    $delimiter = "/";
    $escaped = preg_quote($string, $delimiter);
    return $escaped;
}
?>