PHP Code:
<?php
// ********** FUNCTIONS **********
// http://www.merchantos.com/makebeta/php/scraping-links-with-php/#put_together
/**
 * Insert one scraped link into the `links` table.
 *
 * Uses the most recently opened MySQL connection (no link identifier is
 * passed) and terminates the script if the INSERT fails.
 *
 * @param string $url           The href value found on the page.
 * @param string $gathered_from The URL of the page the link was found on.
 * @return void
 */
function storeLink( $url,$gathered_from ) {
    // Both values originate from scraped HTML, so escape them before building
    // the SQL; an href containing a quote would otherwise break (or inject
    // into) the statement.
    $safe_url = mysql_real_escape_string( $url );
    $safe_gathered_from = mysql_real_escape_string( $gathered_from );
    $query = "INSERT INTO links ( url, gathered_from ) VALUES ( '$safe_url', '$safe_gathered_from' )";
    mysql_query($query) or die( 'Error: Main INSERT query failed' );
}
// ********** Initial settings **********
// ---- Crawl configuration ----
// Root address; every entry of $page is appended to it to build a target url
$start_page = 'http://www.rubblewebs.co.uk/imagemagick/';
// Pages to parse, relative to $start_page ('' is the root page itself)
$page = array(
    '',
    'art.php',
    'other.php',
    'resize.php',
    'watermark.php',
    'mosaic.php',
    'text.php',
    'server/server.php',
    'codes/codes.php',
    'notes/notes.php'
);
// User agent sent with every request, as some servers will error without one
$userAgent = 'Googlebot/2.1 (http://www.googlebot.com/bot.html)';
// ---- MySQL connection details ----
$host = 'localhost';
$username = '';
$password = '';
$database = '';
// ********** Start the code **********
// Open the MySQL connection with the credentials configured above
$conn = mysql_connect( $host, $username, $password );
// Abort straight away when the server cannot be reached
if ( !$conn ) {
    die ( "Could not connect to MySQL server" );
}
// Select the working database, aborting if it cannot be opened or found
if ( !mysql_select_db( $database, $conn ) ) {
    die ( "Could not open database" );
}
// Every run begins with an empty links table
$query = "TRUNCATE TABLE links";
if ( !mysql_query( $query ) ) {
    die( 'Error: TRUNCATE query failed' );
}
// Read the pages from the array finding the links:
// download each page, parse it, and store the href of every <a> element in
// the body together with the url of the page it was found on.
foreach( $page as $value ){
    $target_url = $start_page.$value;
    // make the cURL request to $target_url
    $ch = curl_init();
    curl_setopt( $ch, CURLOPT_USERAGENT, $userAgent );
    curl_setopt( $ch, CURLOPT_URL,$target_url );
    // Treat HTTP status codes >= 400 as a failed transfer
    curl_setopt( $ch, CURLOPT_FAILONERROR, true );
    curl_setopt( $ch, CURLOPT_FOLLOWLOCATION, true );
    curl_setopt( $ch, CURLOPT_AUTOREFERER, true );
    curl_setopt( $ch, CURLOPT_RETURNTRANSFER,true );
    curl_setopt( $ch, CURLOPT_TIMEOUT, 10 );
    $html = curl_exec( $ch );
    // Capture the error details while the handle is still open ...
    $curl_errno = curl_errno( $ch );
    $curl_error = curl_error( $ch );
    // ... then release it, so handles do not pile up across iterations
    curl_close( $ch );
    // Only a real transfer failure (false) aborts the run
    if ( $html === false ) {
        echo "<br />cURL error number:" .$curl_errno;
        echo "<br />cURL error:" . $curl_error;
        exit;
    }
    // A successful but empty response contains no links; move on to the next page
    if ( $html === '' ) {
        continue;
    }
    // parse the html into a DOMDocument
    $dom = new DOMDocument();
    // Warnings about malformed markup are deliberately suppressed
    @$dom->loadHTML( $html );
    // grab all the links on the page
    $xpath = new DOMXPath( $dom );
    $hrefs = $xpath->evaluate( "/html/body//a" );
    // Save the links etc. into the database
    for ( $i = 0; $i < $hrefs->length; $i++ ) {
        $href = $hrefs->item( $i );
        // Named anchors (<a name="...">) have no href; storing them would
        // only add empty-url rows to the table
        if ( !$href->hasAttribute( 'href' ) ) {
            continue;
        }
        $url = $href->getAttribute( 'href' );
        storeLink( $url,$target_url );
    }
}
// http://www.justin-cook.com/wp/2006/12/12/remove-duplicate-entries-rows-a-mysql-database-table/
// Remove duplicate data based on the url column
// Clear out any temp_table left behind by a run that stopped part-way through
// this sequence; otherwise the CREATE TABLE below would fail on every later run
mysql_query( "DROP TABLE IF EXISTS temp_table" )
    or die( 'Error: DROP of stale temp_table failed'.mysql_error() );
// Create a new table with the data from the current table without the duplicates
// NOTE(review): SELECT * combined with GROUP BY url is rejected when the server
// runs with ONLY_FULL_GROUP_BY in sql_mode - confirm the target server's setting
mysql_query( "CREATE TABLE temp_table AS SELECT * FROM links WHERE 1 GROUP BY url" )
    or die( 'Error: CREATE TABLE failed'.mysql_error() );
// Delete the first table
mysql_query( "DROP TABLE links" )
    or die( 'DROP TABLE failed'.mysql_error() );
// Rename the new table to the original name
mysql_query( "RENAME TABLE temp_table TO links" )
    or die( 'RENAME TABLE failed'.mysql_error() );
// Get the title from the pages put into the database by the original curl code
$query = "SELECT url FROM links";
$returned = mysql_query( $query ) or die( 'Error: SELECT url query failed' );
// Loop through all the urls getting the title tag from each page then saving it with the relevant url in the database
while ( $row = mysql_fetch_array ( $returned ))
{
    $file = $row['url'];
    // Get the page titles
    // NOTE(review): the stored url is appended to $start_page unchanged, so an
    // absolute link or a link relative to a sub-folder resolves to the wrong address
    $ch = curl_init();
    curl_setopt( $ch, CURLOPT_URL, $start_page.$file );
    curl_setopt( $ch, CURLOPT_USERAGENT, $userAgent );
    curl_setopt( $ch, CURLOPT_RETURNTRANSFER,1 );
    $result = curl_exec ( $ch );
    curl_close ( $ch );
    // A failed request or a page without a <title> tag results in an empty
    // title instead of reading a match index that does not exist
    $title = '';
    if ( $result !== false && preg_match( '#<title>(.+?)</title>#i', $result, $matches ) ) {
        $title = $matches[1];
    }
    // Update the rows with the page title
    // Title and url are scraped text: escape both so quotes cannot break the statement
    $safe_title = mysql_real_escape_string( $title );
    $safe_file = mysql_real_escape_string( $file );
    $query = "UPDATE links SET title = '$safe_title' WHERE url = '$safe_file'";
    mysql_query( $query ) or die( 'Error: Update query failed'.mysql_error() );
}
// Code to sort out the pages that returned a 404 error. This was caused by the files with links inside a folder e.g server/server.php
// Each affected url is requested again, this time relative to the folder of
// the page it was gathered from, and the title found there is saved.
$query = "SELECT * FROM links ORDER BY title";
$returned = mysql_query( $query ) or die( 'Error, SELECT query for 404 error failed' );
while ( $row = mysql_fetch_array ( $returned ) )
{
    // Only select the results that have a 404 error as the title
    if ( $row['title'] === 'Error 404 page' ){
        $location = $row['gathered_from'];
        // Keep everything up to and including the final "/" of the gathering
        // page. Cutting at the last slash removes only the trailing file name;
        // a search-and-replace of that name would also strip any earlier
        // occurrence of the same text from the address.
        $last_slash = strrpos( $location, '/' );
        $target = ( $last_slash === false ) ? '' : substr( $location, 0, $last_slash + 1 );
        $path = $target.$row['url'];
        $file = $row['url'];
        // Get the page titles
        $ch = curl_init();
        curl_setopt( $ch, CURLOPT_URL, $path );
        curl_setopt( $ch, CURLOPT_USERAGENT, $userAgent );
        curl_setopt( $ch, CURLOPT_RETURNTRANSFER,1 );
        $result = curl_exec ( $ch );
        curl_close ( $ch );
        // A failed request or a page without a <title> tag results in an empty
        // title instead of reading a match index that does not exist
        $title = '';
        if ( $result !== false && preg_match( '#<title>(.+?)</title>#i', $result, $matches ) ) {
            $title = $matches[1];
        }
        // Update the rows with the page title
        // Title and url are scraped text: escape both so quotes cannot break the statement
        $safe_title = mysql_real_escape_string( $title );
        $safe_file = mysql_real_escape_string( $file );
        $query = "UPDATE links SET title = '$safe_title' WHERE url = '$safe_file'";
        mysql_query( $query ) or die( 'Error: Update the 404 error query failed' );
    }
}
?>
Bookmarks