| 
<?php
/*
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 * (C) Jonathan Schmidt-Dominé 2008-2012 < [email protected] >
 */
 /**
  * Retrieves Google web search results.
  *
  * The result page is requested through a randomly chosen Glype web proxy
  * (see $glypeProxies), the proxy's link rewriting is undone, and the HTML
  * is then scanned with plain string searches for the total hit count and
  * the individual result entries.
  */
 class GoogleCrawler
 {
     /** @var string Result page HTML with proxied links rewritten back to their targets. */
     public $content;
     /**
      * @var array List of results; each entry is an array with the keys
      *            'url', 'title', 'description' and 'cache-url' (null when
      *            the entry has no such link).
      */
     public $results;
     /** @var int Total number of hits reported by the page, 0 if it could not be located. */
     public $numresults;
     /** @var bool Whether the proxy used for the current request base64-encodes URLs. */
     private $usebase64;

     private static $glypeProxies = array(
         '2proxy.info/browse.php',
         'flation.info/browse.php',
         'orbing.info/browse.php',
         'voye.info/browse.php',
         'iners.info/browse.php',
         'horted.info/browse.php',
         'omzeil.com/browse.php',
         'byewall.com/browse.php',
         'iplama.com/browse.php',
         'bouncevia.nl/browse.php'
     );
     // Parallel to $glypeProxies: true when the proxy at the same index is
     // queried with base64 URL encoding (b=25), false for plain URLs (b=0).
     private static $base64required = array(
         true,
         false,
         false,
         false,
         false,
         false,
         true,
         true,
         true,
         true
     );
     private static $glypeMaxTries = 5;
     // Connect and read timeout in seconds for getUrl().
     private static $timeoutSeconds = 5;

     /**
      * Fetches a document with a plain HTTP GET request and follows redirects.
      *
      * @param string $url          Target URL; the scheme is optional and
      *                             defaults to http. https is requested via
      *                             the ssl:// socket transport.
      * @param int    $maxRedirects Maximum number of Location redirects to
      *                             follow before giving up.
      * @return string|false Response body, or false when the URL is unusable,
      *                      the connection fails, a read times out or the
      *                      redirect limit is exceeded.
      */
     public static function getUrl($url, $maxRedirects = 5)
     {
         $url = (string) $url;
         // Line breaks would allow injecting additional request headers.
         if (strpbrk($url, "\r\n") !== false) {
             return false;
         }

         $scheme = 'http';
         if (preg_match('~^(https?)://~i', $url, $schemeMatch)) {
             $scheme = strtolower($schemeMatch[1]);
             $url = (string) substr($url, strlen($schemeMatch[0]));
         }

         // Everything up to the first '/' or '?' is the host; a URL without
         // a path requests the document root.
         $hostLength = strcspn($url, '/?');
         $host = (string) substr($url, 0, $hostLength);
         $path = (string) substr($url, $hostLength);
         if ($host === '') {
             return false;
         }
         if ($path === '') {
             $path = '/';
         } elseif ($path[0] !== '/') {
             $path = '/' . $path;
         }

         if ($scheme === 'https') {
             $fp = fsockopen('ssl://' . $host, 443, $errno, $errstr, self::$timeoutSeconds);
         } else {
             $fp = fsockopen($host, 80, $errno, $errstr, self::$timeoutSeconds);
         }
         if (!$fp) {
             return false;
         }
         stream_set_timeout($fp, self::$timeoutSeconds);

         // HTTP/1.0 with "Connection: close": the body is read until the
         // server closes the socket, so neither keep-alive nor chunked
         // transfer encoding may be used by the server.
         $request = "GET $path HTTP/1.0\r\n"
             . "Host: $host\r\n"
             . "User-Agent: Mozilla/5.0 (X11; U; Linux i686; de; rv:1.9.0.3) Gecko/2008092700 Firefox/3.0.3\r\n"
             . "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8\r\n"
             . "Referer: {$scheme}://{$url}\r\n"
             . "Cookie: \r\n"
             . "Connection: close\r\n\r\n";
         fwrite($fp, $request);

         // Consume the response headers up to the empty separator line.
         while (true) {
             $line = fgets($fp);
             if ($line === false) {
                 // Either a read timeout or the server closed the
                 // connection before the headers were complete.
                 $meta = stream_get_meta_data($fp);
                 if ($meta['timed_out']) {
                     fclose($fp);
                     return false;
                 }
                 break;
             }
             if ($line === "\n" || $line === "\r\n") {
                 break;
             }
             if (strncasecmp($line, 'Location:', 9) === 0) {
                 fclose($fp);
                 $target = trim((string) substr($line, 9));
                 if ($target === '' || $maxRedirects <= 0) {
                     return false;
                 }
                 if (substr($target, 0, 2) === '//') {
                     // Scheme-relative redirect.
                     $target = $scheme . ':' . $target;
                 } elseif ($target[0] === '/') {
                     // Host-relative redirect.
                     $target = $scheme . '://' . $host . $target;
                 }
                 return self::getUrl($target, $maxRedirects - 1);
             }
         }

         $body = stream_get_contents($fp);
         $meta = stream_get_meta_data($fp);
         fclose($fp);
         if ($body === false || $meta['timed_out']) {
             return false;
         }
         return $body;
     }

     /**
      * preg_replace_callback() helper: URL-decodes the first capture group.
      */
     private static function urldecode1(array $results)
     {
         return urldecode($results[1]);
     }

     /**
      * Replaces a Google redirect link (google.<tld>/url?q=<target>&...) with
      * its decoded target; any other URL is returned unchanged.
      */
     private static function removeGoogleSpyingUrl($url)
     {
         return preg_replace_callback('~^https?://(?:www\.)?google\.[a-zA-Z]+/url\?q=([^&]*).*$~', array(__CLASS__, 'urldecode1'), $url);
     }

     /**
      * preg_replace_callback() helper for getUrlViaProxy(): converts the "u"
      * parameter of a proxied link back into a direct href attribute.
      */
     private function base64Callback(array $match)
     {
         $tmp = urldecode(html_entity_decode($match[1]));
         if ($this->usebase64) {
             $tmp = base64_decode($tmp);
         }
         // In base64 mode the decoded value is prefixed with 'http' again;
         // presumably the proxy drops that prefix before encoding.
         $prefix = $this->usebase64 ? 'http' : '';
         return 'href="' . self::removeGoogleSpyingUrl($prefix . htmlspecialchars($tmp)) . '"';
     }

     /**
      * Fetches a URL through randomly chosen Glype proxies and rewrites the
      * proxied links in the response back to their targets.
      *
      * @param string $url URL to request through a proxy.
      * @return string Response HTML with de-proxied href attributes.
      * @throws Exception When no proxy delivered a document within
      *                   $glypeMaxTries attempts.
      */
     public function getUrlViaProxy($url)
     {
         for ($try = 0; $try < self::$glypeMaxTries; ++$try) {
             $id = array_rand(self::$glypeProxies);
             $proxy = self::$glypeProxies[$id];
             $this->usebase64 = self::$base64required[$id];
             // NOTE(review): this writes to the page output; kept because
             // callers may rely on it being shown.
             echo 'Proxy: ' . htmlspecialchars($proxy);
             $data = self::getUrl($proxy . '?u=' . urlencode($url) . ($this->usebase64 ? '&b=25' : '&b=0'));
             if (!$data) {
                 continue;
             }
             $rewritten = preg_replace_callback(
                 '~href="http://' . preg_quote($proxy, '~') . '\?u=([^&"]*)[^"]*"~',
                 array($this, 'base64Callback'),
                 $data
             );
             // null signals a PCRE failure; try another proxy in that case.
             if ($rewritten !== null) {
                 return $rewritten;
             }
         }
         throw new Exception("Can't get any Google-Results");
     }
     // Markup expected by the parser below:
     // <h2 class="r"><a class="l" href="{url-proxy-encoded}" class="l" >{title}</a>(…stuff…)<div class="std"><span class="s">{description}</span>…<span class="a">{url} - </span><nobr><a href="{cache-url-encoded}" class="fl"(…don't care…)s

     /**
      * Runs the search and fills $content, $numresults and $results.
      *
      * @param string $keywordsGot Search terms.
      * @param int    $pageNum     1-based result page; 0 requests the first
      *                            page without a "start" parameter.
      * @param int    $number      Results per page.
      * @param string $lang        Value of the "hl" parameter.
      * @param string $googleurl   Base URL of the search endpoint.
      * @throws Exception When the page could not be fetched through any proxy.
      */
     public function __construct($keywordsGot, $pageNum = 1, $number = 10, $lang = 'en', $googleurl = 'http://www.google.com/cse')
     {
         $query = $googleurl . '?q=' . urlencode($keywordsGot);
         if ($pageNum != 0) {
             $query .= '&start=' . (($pageNum - 1) * $number);
         }
         $query .= '&num=' . $number . '&ie=UTF-8&oe=UTF-8&hl=' . $lang;

         $this->content = $this->getUrlViaProxy($query);
         $this->parseContent();
     }

     /**
      * Returns the offset just behind the first occurrence of $needle at or
      * after $from, or false when $from is false/out of range or the needle
      * does not occur. Lets a failed search propagate through a chain of
      * lookups instead of being turned into a bogus offset.
      */
     private static function after($html, $needle, $from)
     {
         if ($from === false || $from > strlen($html)) {
             return false;
         }
         $pos = strpos($html, $needle, $from);
         return $pos === false ? false : $pos + strlen($needle);
     }

     /**
      * Extracts the total hit count and the result entries from $content.
      */
     private function parseContent()
     {
         $html = (string) $this->content;
         $this->numresults = 0;
         $this->results = array();

         // Statistics cell: the total is the content of the third <b>
         // element behind the marker, so two closing tags are skipped first.
         $cursor = self::after($html, '<td nowrap align="right"><font size="-1">', 0);
         $cursor = self::after($html, '</b>', $cursor);
         $cursor = self::after($html, '</b>', $cursor);
         $numStart = self::after($html, '<b>', $cursor);
         $numEnd = ($numStart === false) ? false : strpos($html, '</b>', $numStart);

         $searchFrom = 0;
         if ($numEnd !== false) {
             $digits = str_replace(array('.', ','), '', substr($html, $numStart, $numEnd - $numStart));
             $this->numresults = intval($digits);
             $searchFrom = $numEnd + 4;
         }

         $head = '<h2 class="r"><a class="l" href="';
         $descMarker = '<div class="std"><span class="s">';
         $cacheMarker = '<nobr><a href="http://';
         $cacheHrefOffset = strlen('<nobr><a href="');

         $pos = ($searchFrom > strlen($html)) ? false : strpos($html, $head, $searchFrom);
         while ($pos !== false) {
             $curr = array();

             // Link target: everything up to the closing quote of href.
             $urlStart = $pos + strlen($head);
             $urlEnd = strpos($html, '"', $urlStart);
             if ($urlEnd === false) {
                 break;
             }
             $curr['url'] = html_entity_decode(substr($html, $urlStart, $urlEnd - $urlStart));

             // Title: content of the <a> element.
             $tagEnd = strpos($html, '>', $urlEnd);
             if ($tagEnd === false) {
                 break;
             }
             $titleEnd = strpos($html, '</a>', $tagEnd);
             if ($titleEnd === false) {
                 break;
             }
             $curr['title'] = substr($html, $tagEnd + 1, $titleEnd - $tagEnd - 1);

             // Description snippet.
             $descStart = self::after($html, $descMarker, $titleEnd);
             if ($descStart === false) {
                 break;
             }
             $descEnd = strpos($html, '</span>', $descStart);
             if ($descEnd === false) {
                 break;
             }
             $curr['description'] = substr($html, $descStart, $descEnd - $descStart);

             // The cache link belongs to this entry only if it appears
             // before the next result heading.
             $cachePos = strpos($html, $cacheMarker, $descEnd);
             $nextPos = strpos($html, $head, $descEnd);
             $curr['cache-url'] = null;
             if ($cachePos !== false && ($nextPos === false || $cachePos < $nextPos)) {
                 $innerFrom = $cachePos + strlen($cacheMarker);
                 $hrefEnd = strpos($html, '"', $innerFrom);
                 if ($hrefEnd !== false) {
                     $inner = strpos($html, '//', $innerFrom);
                     if ($inner !== false && $inner < $hrefEnd) {
                         // A URL nested inside the link: keep the part
                         // behind its '//' and prefix it with http://.
                         $curr['cache-url'] = 'http://' . html_entity_decode(substr($html, $inner + 2, $hrefEnd - $inner - 2));
                     } else {
                         // No nested URL inside the attribute: use the
                         // link itself.
                         $hrefStart = $cachePos + $cacheHrefOffset;
                         $curr['cache-url'] = html_entity_decode(substr($html, $hrefStart, $hrefEnd - $hrefStart));
                     }
                 }
             }

             $this->results[] = $curr;
             $pos = $nextPos;
         }
     }

 }
 ?>
 |