<?php
/*****/
/*
Written by: Aziz S. Hussain
Email: azizsaleh@gmail.com
Website: www.azizsaleh.com
Produced under GPL License
*/
/*****/
/*
Email address scraper based on a URL.
*/
class scraper
{
// URL that stores first URL to start
var $startURL;
// List of allowed page extensions
var $allowedExtensions = array('.css','.xml','.rss','.ico','.js','.gif','.jpg','.jpeg','.png','.bmp','.wmv'
,'.avi','.mp3','.flash','.swf','.css');
// Which URL to scrape
var $useURL;
// Start path, for links that are relative
var $startPath;
// Set start path
function setStartPath($path = NULL){
if($path != NULL)
{
$this->startPath = $path;
} else {
$temp = explode('/',$this->startURL);
$this->startPath = $temp[0].'//'.$temp[2];
}
}
// Add the start URL
function startURL($theURL){
// Set start URL
$this->startURL = $theURL;
}
// Function to get URL contents
function getContents($url)
{
$ch = curl_init(); // initialize curl handle
curl_setopt($ch, CURLOPT_HEADER, 0);
curl_setopt($ch, CURLOPT_VERBOSE, 0);
curl_setopt($ch, CURLOPT_USERAGENT, "Mozilla/4.0 (compatible;)");
curl_setopt($ch, CURLOPT_AUTOREFERER, false);
curl_setopt($ch, CURLOPT_CONNECTTIMEOUT,7);
curl_setopt($ch, CURLOPT_REFERER, 'http://'.$this->useURL);
curl_setopt($ch, CURLOPT_URL,$url); // set url to post to
curl_setopt($ch, CURLOPT_FAILONERROR, 1);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);// allow redirects
curl_setopt($ch, CURLOPT_RETURNTRANSFER,1); // return into a variable
curl_setopt($ch, CURLOPT_TIMEOUT, 50); // times out after 50s
curl_setopt($ch, CURLOPT_POST, 0); // set POST method
$buffer = curl_exec($ch); // run the whole process
curl_close($ch);
return $buffer;
}
// Actually do the URLS
function startScraping()
{
// Get page content
$pageContent = $this->getContents($this->startURL);
echo 'Scraping URL: '.$this->startURL.PHP_EOL;
// Get list of all emails on page
preg_match_all('/([\w+\.]*\w+@[\w+\.]*\w+[\w+\-\w+]*\.\w+)/is',$pageContent,$results);
// Add the email to the email list array
$insertCount=0;
foreach($results[1] as $curEmail)
{
$insert = mysql_query("INSERT INTO `emaillist` (`emailadd`) VALUES ('$curEmail')");
if($insert){$insertCount++;}
}
echo 'Emails found: '.number_format($insertCount).PHP_EOL;
// Mark the page done
$insert = mysql_query("INSERT INTO `finishedurls` (`urlname`) VALUES ('".$this->startURL."')");
// Get list of new page URLS is emails were found on previous page
preg_match_all('/href="([^"]+)"/Umis',$pageContent,$results);
$currentList = $this->cleanListURLs($results[1]);
$insertURLCount=0;
// Add the list to the array
foreach($currentList as $curURL)
{
$insert = mysql_query("INSERT INTO `workingurls` (`urlname`) VALUES ('$curURL')");
if($insert){$insertURLCount++;}
}
echo 'URLs found: '.number_format($insertURLCount).PHP_EOL;
$getURL = mysql_fetch_assoc(mysql_query("SELECT `urlname` FROM `workingurls` ORDER BY RAND() LIMIT 1"));
$remove = mysql_query("DELETE FROM `workingurls` WHERE `urlname`='$getURL[urlname]' LIMIT 1");
// Get the new page ready
$this->startURL = $getURL['urlname'];
$this->setStartPath();
// If no more pages, return
if($this->startURL == NULL){ return;}
// Clean vars
unset($results,$pageContent);
// If more pages, loop again
$this->startScraping();
}
// Function to clean input URLS
function cleanListURLs($linkList)
{
foreach($linkList as $sub => $url)
{
// Check if only 1 character - there must exist at least / character
if(strlen($url) <= 1){unset($linkList[$sub]);}
// Check for any javascript
if(eregi('javascript',$url)){unset($linkList[$sub]);}
// Check for invalid extensions
str_replace($this->allowedExtensions,'',$url,$count);
if($count > 0){ unset($linkList[$sub]);}
// If URL starts with #, ignore
if(substr($url,0,1) == '#'){unset($linkList[$sub]);}
// If everything is OK and path is relative, add starting path
if(substr($url,0,1) == '/' || substr($url,0,1) == '?' || substr($url,0,1) == '='){
$linkList[$sub] = $this->startPath.$url;
}
}
return $linkList;
}
}
?>
|