<?php
/*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* (C) Jonathan Schmidt-Dominé 2008-2012 < devel@the-user.org >
*/
/*
* Retrieves Google web search results
*/
class GoogleCrawler
{
    /** Raw HTML of the fetched result page, as returned by getUrlViaProxy(). */
    public $content;
    /**
     * Parsed hits. Each entry is an array with the keys 'url', 'title',
     * 'description' and 'cache-url' ('cache-url' is null when no cache link
     * was found for that hit).
     */
    public $results;
    /** Result count parsed from the page header; 0 when the header was not found. */
    public $numresults;
    /** Flag copied from $base64required for the proxy used by the current request. */
    private $usebase64;
    /** Proxy endpoints (host + script path, without scheme) that requests are routed through. */
    private static $glypeProxies = array(
        '2proxy.info/browse.php',
        'flation.info/browse.php',
        'orbing.info/browse.php',
        'voye.info/browse.php',
        'iners.info/browse.php',
        'horted.info/browse.php',
        'omzeil.com/browse.php',
        'byewall.com/browse.php',
        'iplama.com/browse.php',
        'bouncevia.nl/browse.php'
    );
    /**
     * Parallel to $glypeProxies (same index): true when the proxy is queried
     * with "b=25" and its rewritten links must be base64-decoded.
     */
    private static $base64required = array(
        true,
        false,
        false,
        false,
        false,
        false,
        true,
        true,
        true,
        true
    );
    /** How many randomly chosen proxies are tried before giving up. */
    private static $glypeMaxTries = 5;

    /**
     * Fetches a document over plain HTTP (port 80) and returns its body.
     *
     * Any "Location:" response header is followed, whatever the status line
     * says. Only http:// targets are supported; other schemes yield false.
     *
     * @param string $url          Target, with or without a leading "http://".
     * @param int    $maxRedirects Number of redirects that may still be followed.
     * @return string|false Response body, or false on connection failure,
     *                      read timeout, unsupported scheme or too many redirects.
     */
    public static function getUrl($url, $maxRedirects = 10)
    {
        if(strncasecmp($url, 'http://', 7) == 0)
            $url = substr($url, 7);
        // Anything still carrying a scheme (https://, ftp://, ...) cannot be
        // served by the raw port-80 socket below.
        if(preg_match('~^[a-zA-Z][a-zA-Z0-9+.\-]*://~', $url))
            return false;
        $slashPos = strpos($url, '/');
        if($slashPos === false)
        {
            // Bare host name: request the document root.
            $host = $url;
            $path = '/';
        }
        else
        {
            $host = substr($url, 0, $slashPos);
            $path = substr($url, $slashPos);
        }
        if($host === '')
            return false;
        $fp = fsockopen($host, 80);
        // The handle has to be checked before it is used for anything else.
        if(!$fp)
            return false;
        stream_set_timeout($fp, 5);
        // HTTP requires CRLF line endings. "Connection: close" makes the server
        // end the stream after the response, which is what reading to EOF needs.
        $request = "GET $path HTTP/1.1\r\n"
            . "Host: $host\r\n"
            . "User-Agent: Mozilla/5.0 (X11; U; Linux i686; de; rv:1.9.0.3) Gecko/2008092700 Firefox/3.0.3\r\n"
            . "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8\r\n"
            . "Referer: http://$url\r\n"
            . "Cookie: \r\n"
            . "Connection: close\r\n\r\n";
        fwrite($fp, $request);
        $chunked = false;
        while(!feof($fp))
        {
            // No length limit: a long Location header must not be cut off.
            $line = fgets($fp);
            if($line === false || $line == "\n" || $line == "\r\n")
                break;
            if(strncasecmp($line, 'Location:', 9) == 0)
            {
                fclose($fp);
                if($maxRedirects <= 0)
                    return false;
                $target = trim(substr($line, 9));
                if(substr($target, 0, 2) == '//')
                    $target = substr($target, 2); // protocol-relative redirect
                else if(substr($target, 0, 1) == '/')
                    $target = $host . $target; // path-only redirect on the same host
                return self::getUrl($target, $maxRedirects - 1);
            }
            if(strncasecmp($line, 'Transfer-Encoding:', 18) == 0 && stripos($line, 'chunked') !== false)
                $chunked = true;
        }
        $body = stream_get_contents($fp);
        $info = stream_get_meta_data($fp);
        fclose($fp);
        if($body === false || $info['timed_out'])
            return false;
        return $chunked ? self::decodeChunkedBody($body) : $body;
    }

    /**
     * Reassembles a body sent with "Transfer-Encoding: chunked".
     *
     * @param string $body Raw body as read from the socket.
     * @return string Decoded payload; the input is returned unchanged when it
     *                does not start with a valid chunk-size line.
     */
    private static function decodeChunkedBody($body)
    {
        $decoded = '';
        $offset = 0;
        $length = strlen($body);
        while($offset < $length)
        {
            $lineEnd = strpos($body, "\r\n", $offset);
            if($lineEnd === false)
                break;
            $sizeField = substr($body, $offset, $lineEnd - $offset);
            // A chunk size may be followed by ";extension" parameters.
            $semicolon = strpos($sizeField, ';');
            if($semicolon !== false)
                $sizeField = substr($sizeField, 0, $semicolon);
            $sizeField = trim($sizeField);
            if($sizeField === '' || !ctype_xdigit($sizeField))
                return $decoded === '' ? $body : $decoded;
            $size = hexdec($sizeField);
            if($size == 0)
                break; // terminating chunk
            $decoded .= substr($body, $lineEnd + 2, $size);
            // Skip the size line, the chunk data and the CRLF that follows it.
            $offset = $lineEnd + 2 + $size + 2;
        }
        return $decoded;
    }

    /**
     * preg_replace_callback() helper: URL-decodes the first capture group.
     */
    private static function urldecode1(array $results)
    {
        return urldecode($results[1]);
    }

    /**
     * Turns a Google redirect link ("http://google.tld/url?q=TARGET&...") into
     * TARGET. Other URLs are returned unchanged.
     */
    private static function removeGoogleSpyingUrl($url)
    {
        return preg_replace_callback('~^http://(?:www\.)?google\.[a-zA-Z]+/url\?q=([^&]*).*$~', array(__CLASS__, 'urldecode1'), $url);
    }

    /**
     * preg_replace_callback() helper for getUrlViaProxy(): rebuilds an href
     * attribute from the "u" parameter of a proxied link.
     */
    private function base64Callback(array $match)
    {
        $tmp = urldecode(html_entity_decode($match[1]));
        if($this->usebase64)
            $tmp = base64_decode($tmp);
        // For base64 proxies the decoded value is prefixed with 'http'
        // (presumably the proxy strips it before encoding - confirm).
        return 'href="' . self::removeGoogleSpyingUrl(($this->usebase64 ? 'http' : '') . htmlspecialchars($tmp)) . '"';
    }

    /**
     * Fetches $url through a randomly chosen proxy and rewrites the proxied
     * links in the returned HTML back to their targets.
     *
     * @param string $url Absolute URL to fetch.
     * @return string HTML with proxy hrefs replaced.
     * @throws Exception when none of the attempts returned any data.
     */
    public function getUrlViaProxy($url)
    {
        for($i = 0; $i < self::$glypeMaxTries; ++$i)
        {
            $id = mt_rand(0, count(self::$glypeProxies) - 1);
            $proxy = self::$glypeProxies[$id];
            $this->usebase64 = self::$base64required[$id];
            // NOTE(review): this writes to the page output; kept because
            // existing callers may rely on it.
            echo 'Proxy: ' . htmlspecialchars($proxy);
            $data = self::getUrl($proxy . "?u=" . urlencode($url) . ($this->usebase64 ? "&b=25" : "&b=0"));
            if($data)
            {
                // The proxy name contains dots and must be matched literally.
                $pattern = '~href="http://' . preg_quote($proxy, '~') . '\?u=([^&"]*)[^"]*"~';
                return preg_replace_callback($pattern, array($this, 'base64Callback'), $data);
            }
        }
        throw new Exception("Can't get any Google-Results");
    }

    /**
     * strpos() that tolerates a failed previous search and out-of-range offsets.
     *
     * @param string    $haystack Text to search.
     * @param string    $needle   Text to look for.
     * @param int|false $from     Position of the previous match, or false.
     * @param int       $skip     Bytes to skip after $from before searching.
     * @return int|false Match position, or false when $from is false, the
     *                   offset lies beyond the text or nothing matches.
     */
    private static function findFrom($haystack, $needle, $from, $skip = 0)
    {
        if($from === false)
            return false;
        $offset = $from + $skip;
        if($offset > strlen($haystack))
            return false;
        return strpos($haystack, $needle, $offset);
    }

    /**
     * Runs the query and parses the result page into $numresults and $results.
     *
     * Markup each hit is expected to follow:
     * <h2 class="r"><a class="l" href="{url}" class="l" >{title}</a> ...
     * <div class="std"><span class="s">{description}</span> ...
     * <nobr><a href="{cache-url}" class="fl" ...
     *
     * @param string $keywordsGot Search terms.
     * @param int    $pageNum     1-based page number; 0 omits the "start" parameter.
     * @param int    $number      Results per page.
     * @param string $lang        Value for the "hl" parameter.
     * @param string $googleurl   Search endpoint.
     * @throws Exception when the page could not be fetched (see getUrlViaProxy()).
     */
    public function __construct($keywordsGot, $pageNum = 1, $number = 10, $lang = 'en', $googleurl = 'http://www.google.com/cse')
    {
        $query = $googleurl . "?q=" . urlencode($keywordsGot);
        if($pageNum != 0)
            $query .= "&start=" . (($pageNum - 1) * $number);
        $query .= "&num=" . $number . "&ie=UTF-8&oe=UTF-8&hl=" . $lang;
        $this->content = $this->getUrlViaProxy($query);
        $html = (string) $this->content;

        // Result count: the third <b>...</b> following the statistics cell.
        $this->numresults = 0;
        $resultsStart = 0;
        $pos = self::findFrom($html, '<td nowrap align="right"><font size="-1">', 0);
        $pos = self::findFrom($html, '</b>', $pos, 34);
        $pos = self::findFrom($html, '</b>', $pos, 4);
        $pos = self::findFrom($html, '<b>', $pos, 4);
        $npos = self::findFrom($html, '</b>', $pos, 3);
        if($npos !== false)
        {
            // Drop thousands separators of either locale before converting.
            $this->numresults = intval(str_replace(array('.', ','), '', substr($html, $pos + 3, $npos - $pos - 3)));
            $resultsStart = $npos + 4;
        }

        $this->results = array();
        $hitMarker = '<h2 class="r"><a class="l" href="'; // 33 bytes
        $pos = self::findFrom($html, $hitMarker, $resultsStart);
        while($pos !== false)
        {
            $curr = array();
            $urlStart = $pos + 33;
            $urlEnd = strpos($html, '"', $urlStart);
            if($urlEnd === false)
                break;
            $curr['url'] = html_entity_decode(substr($html, $urlStart, $urlEnd - $urlStart));

            $tagEnd = strpos($html, '>', $urlEnd);
            if($tagEnd === false)
                break;
            $titleEnd = strpos($html, '</a>', $tagEnd);
            if($titleEnd === false)
                break;
            $curr['title'] = substr($html, $tagEnd + 1, $titleEnd - $tagEnd - 1);

            $descStart = strpos($html, '<div class="std"><span class="s">', $titleEnd);
            if($descStart === false)
                break;
            $descEnd = strpos($html, '</span>', $descStart + 33);
            if($descEnd === false)
                break;
            $curr['description'] = substr($html, $descStart + 33, $descEnd - $descStart - 33);

            // A cache link belongs to this hit only if it precedes the next hit.
            $cachePos = strpos($html, '<nobr><a href="http://', $descEnd);
            $nextPos = strpos($html, $hitMarker, $descEnd);
            $curr['cache-url'] = null;
            if($cachePos !== false && ($nextPos === false || $cachePos < $nextPos))
            {
                $hrefEnd = strpos($html, '"', $cachePos + 22);
                if($hrefEnd !== false)
                {
                    $inner = strpos($html, '//', $cachePos + 22);
                    if($inner !== false && $inner < $hrefEnd)
                    {
                        // The href wraps another URL: keep what follows the nested "//".
                        $curr['cache-url'] = 'http://' . html_entity_decode(substr($html, $inner + 2, $hrefEnd - $inner - 2));
                    }
                    else
                    {
                        // No nested URL: use the href value itself (it starts
                        // 15 bytes into the matched '<nobr><a href="' prefix).
                        $curr['cache-url'] = html_entity_decode(substr($html, $cachePos + 15, $hrefEnd - $cachePos - 15));
                    }
                }
            }
            $pos = $nextPos;
            $this->results[] = $curr;
        }
    }
}
?>
|