News grabber.. can some1 help?
Posted: Tue Jan 14, 2003 6:39 am
Hey i made a news grabber script that take news from a F1 site.
Only i've problems with the "base url"
Instead of http://f1.racing.com/en/headlines it displays http://gpfanatics.com/headlines
You can see the script in action here: http://www.gpfanatics.com/f1news.php
Can some1 help me?
Ciao
<?
$filename = "http://f1.racing-live.com/en/index.shtml"; // Location of the News Source
$start = "<!-- FIL DE L'INFO -->"; // Start Grabbing Code
$stop = "<!-- BOXES -->"; // Stop Grabbing Code
// Get contents of the specified URL and writes it into a string
$fd = fopen( $filename, "r" );
$contents = fread( $fd, 200000 );
fclose( $fd );
// Isolates desired section.
if(eregi("$start(.*)$stop", $contents, $printing)) {
$substring=$printing[1];
// while is added as there are multiple instances of the </table> string & eregi
// searches to include the most that matches, not the next.
while(eregi("(.*)$stop", $substring, $printing)) {
$substring=$printing[1];
};
} else {
echo "Didn't find Daily summary";
}
// Replaces specific HTML tags and junk text
$printing[1] = eregi_replace( "- .* records</b>", "", $printing[1] ); // Text
$printing[1] = eregi_replace( "<IMG SRC=[^>]*>", "", $printing[1] ); // Images
$printing[1] = eregi_replace( "<font[^>]*>", "", $printing[1] ); // Fonts
$printing[1] = eregi_replace( "</font>", "", $printing[1] );
$printing[1] = eregi_replace( "<tr[^>]*>", "<li>", $printing[1] ); // Table Codes
$printing[1] = eregi_replace( "<td[^>]*>", "", $printing[1] );
$printing[1] = eregi_replace( "</tr>", "", $printing[1] );
$printing[1] = eregi_replace( "</td>", "", $printing[1] );
$printing[1] = eregi_replace( "<li>", "", $printing[1] );
$printing[1] = str_replace("HREF=\"", "href=\"http://f1.racing-live.com/en/", "$printing[1]"); // Adds Absolute URL
$printing[1] = eregi_replace("HREF=\"", "href=\"http://f1.racing-live.com/en/", "$printing[1]");
$printing[1] = eregi_replace( "\">", "\" target=\"_blank\" >", "$printing[1]");
$printing[1] = eregi_replace( " ", "", "$printing[1]");
$printing[1] = eregi_replace( " ", "", "$printing[1]");
$printing[1] = eregi_replace( " ", "", "$printing[1]");
$printing[1] = eregi_replace( " ", "", "$printing[1]");
$printing[1] = eregi_replace( " <br>", "", "$printing[1]");
$printing[1] = eregi_replace("<span class=\"infos\"", "<br><span class=\"downloadlink\"", "$printing[1]");
$printing[1] = ereg_replace("[ \r\n\f\t\v]+\\|[ \r\n\f\t\v]+", "", $printing[1]);
$printing[1] = preg_replace ("'([\r\n])[\s]+'", "\\1", $printing[1]); # replaces all junk strings and linebreaks
echo $printing[1];
?>
Only i've problems with the "base url"
Instead of http://f1.racing.com/en/headlines it displays http://gpfanatics.com/headlines
You can see the script in action here: http://www.gpfanatics.com/f1news.php
Can some1 help me?
Ciao
<?
$filename = "http://f1.racing-live.com/en/index.shtml"; // Location of the News Source
$start = "<!-- FIL DE L'INFO -->"; // Start Grabbing Code
$stop = "<!-- BOXES -->"; // Stop Grabbing Code
// Get contents of the specified URL and writes it into a string
$fd = fopen( $filename, "r" );
$contents = fread( $fd, 200000 );
fclose( $fd );
// Isolates desired section.
if(eregi("$start(.*)$stop", $contents, $printing)) {
$substring=$printing[1];
// while is added as there are multiple instances of the </table> string & eregi
// searches to include the most that matches, not the next.
while(eregi("(.*)$stop", $substring, $printing)) {
$substring=$printing[1];
};
} else {
echo "Didn't find Daily summary";
}
// Replaces specific HTML tags and junk text
$printing[1] = eregi_replace( "- .* records</b>", "", $printing[1] ); // Text
$printing[1] = eregi_replace( "<IMG SRC=[^>]*>", "", $printing[1] ); // Images
$printing[1] = eregi_replace( "<font[^>]*>", "", $printing[1] ); // Fonts
$printing[1] = eregi_replace( "</font>", "", $printing[1] );
$printing[1] = eregi_replace( "<tr[^>]*>", "<li>", $printing[1] ); // Table Codes
$printing[1] = eregi_replace( "<td[^>]*>", "", $printing[1] );
$printing[1] = eregi_replace( "</tr>", "", $printing[1] );
$printing[1] = eregi_replace( "</td>", "", $printing[1] );
$printing[1] = eregi_replace( "<li>", "", $printing[1] );
$printing[1] = str_replace("HREF=\"", "href=\"http://f1.racing-live.com/en/", "$printing[1]"); // Adds Absolute URL
$printing[1] = eregi_replace("HREF=\"", "href=\"http://f1.racing-live.com/en/", "$printing[1]");
$printing[1] = eregi_replace( "\">", "\" target=\"_blank\" >", "$printing[1]");
$printing[1] = eregi_replace( " ", "", "$printing[1]");
$printing[1] = eregi_replace( " ", "", "$printing[1]");
$printing[1] = eregi_replace( " ", "", "$printing[1]");
$printing[1] = eregi_replace( " ", "", "$printing[1]");
$printing[1] = eregi_replace( " <br>", "", "$printing[1]");
$printing[1] = eregi_replace("<span class=\"infos\"", "<br><span class=\"downloadlink\"", "$printing[1]");
$printing[1] = ereg_replace("[ \r\n\f\t\v]+\\|[ \r\n\f\t\v]+", "", $printing[1]);
$printing[1] = preg_replace ("'([\r\n])[\s]+'", "\\1", $printing[1]); # replaces all junk strings and linebreaks
echo $printing[1];
?>