<?php /** * <p> * Script to safely serve a file for download. * And, most important, block downloads for bots to save bandwidth on our server * </p> * * <p> * No harvesting allowed! * </p> * * @author Nico den Boer <nico@nicodenboer.com>, <www.nicodenboer.com>, <www.denboer-ims.nl> * @version 1.0.1 * @package AntiHarvestDownload */
/**
 * Class to do the actual work
 *
 * Serves a file with a whitelisted extension from a configured directory as a
 * download, and refuses the request when the client looks like a bot or
 * harvester (by user agent, by reverse-resolved host name, or - optionally -
 * by the number of downloads per IP address registered in redis).
 *
 * @package AntiHarvestDownload
 */
class protector
{
    /**
     * Holds (partial, lowercase) user agent names which are refused
     *
     * @var array
     * @access private
     */
    private $_agents;

    /**
     * Holds (partial, lowercase) host names which are refused
     *
     * @var array
     * @access private
     */
    private $_hosts;

    /**
     * Holds path to actual files. It is prepended to the file name as is,
     * so it has to end with a directory separator.
     *
     * @var string
     * @access private
     */
    private $_path;

    /**
     * Determines if we use predis or not
     *
     * @var bool
     * @access private
     */
    private $_predisUse;

    /**
     * Determines path to predis class
     *
     * @var string
     * @access private
     */
    private $_predisPath;

    /**
     * Holds the time in seconds that visits will expire
     *
     * @var int
     * @access private
     */
    private $_predisTime;

    /**
     * Holds the number of visits which are allowed within the timeframe
     *
     * @var int
     * @access private
     */
    private $_predisCnt;

    /**
     * Constructor
     *
     * Installs the default agent and host block lists and disables the
     * predis based rate limit until setPredis() is called.
     *
     * @access public
     * @return void
     */
    public function __construct()
    {
        $this->_agents = array(
            'wget', 'emailsiphon', 'webzip', 'msproxy/2.0', 'emailwolf',
            'webbandit', 'ms frontpage', 'bot', 'slurp', 'scooter', 'spider',
            'crawler', 'worm', 'internetseer.com', 'archiver', 'msnptc',
            'libwww-perl', 'channel-index', 'linkwalker', 'holmes', 'yeti',
            'indexer'
        );
        $this->_hosts = array('bot', 'spider', 'crawler', 'yandex');
        $this->_path = '';

        // Explicit defaults, so process() never reads uninitialised state
        $this->_predisUse = false;
        $this->_predisPath = '';
        $this->_predisTime = 0;
        $this->_predisCnt = 0;
    }

    /**
     * Set (partial) names of user agents, which we know they are bots.
     * Names should be in lowercase.
     *
     * @access public
     * @param array $agents
     * @return void
     */
    public function setAgents($agents)
    {
        $this->_agents = $agents;
    }

    /**
     * Set (partial) names of hosts which we know as bots or harvesters.
     * Names should be in lowercase.
     *
     * @access public
     * @param array $hosts
     * @return void
     */
    public function setHosts($hosts)
    {
        $this->_hosts = $hosts;
    }

    /**
     * Set the path to the physical downloads
     *
     * @access public
     * @param string $str Directory, including the trailing separator
     * @return void
     */
    public function setPath($str)
    {
        $this->_path = $str;
    }

    /**
     * Initialize predis
     *
     * @access public
     * @param string $path File to require in order to load Predis\Client
     * @param int $time Number of seconds after which a visit expires
     * @param int $cnt Number of visits allowed within that timeframe
     * @return void
     */
    public function setPredis($path, $time, $cnt)
    {
        $this->_predisUse = true;
        $this->_predisPath = $path;
        $this->_predisTime = $time;
        $this->_predisCnt = $cnt;
    }

    /**
     * Process the request
     *
     * Reads the requested name from $_GET['file'] and answers with exactly
     * one of: 404 (file missing or extension not accepted), 403 (client is
     * considered a bot) or the file itself.
     *
     * @access public
     * @return void
     */
    public function process()
    {
        $file = $this->_requestedFile();
        $fullPath = $this->_path . $file;

        // is_file() instead of file_exists(): a directory is not a download
        if ($file === '' || !is_file($fullPath)) {
            $this->_sendNotFound('<h1>File does not exist</h1>');
            return;
        }

        $ctype = $this->_mimeType($file);
        if ($ctype === null) {
            // No extension, or extension not known.
            $this->_sendNotFound('<h1>Filename not accepted</h1>');
            return;
        }

        $agent = isset($_SERVER['HTTP_USER_AGENT']) ? (string) $_SERVER['HTTP_USER_AGENT'] : '';
        $ip = isset($_SERVER['REMOTE_ADDR']) ? (string) $_SERVER['REMOTE_ADDR'] : '';

        // The host check (a DNS lookup) only runs when the agent check passed
        $isBot = $this->_matchesAny($agent, $this->_agents) || $this->_isRefusedHost($ip);

        if (!$isBot && $this->_predisUse) {
            $isBot = $this->_isHarvesting($ip, $file);
        }

        if ($isBot) {
            $this->_sendForbidden();
            return;
        }

        // It seems we can send this file ...
        $this->_sendFile($fullPath, $file, $ctype);
    }

    /**
     * Get the cleaned up file name from the request
     *
     * @access private
     * @return string Bare file name, or '' if the request is not usable
     */
    private function _requestedFile()
    {
        if (!isset($_GET['file']) || !is_string($_GET['file'])) {
            return '';
        }

        // Decode BEFORE basename(): otherwise an encoded separator such as
        // "&#47;" survives basename() and is turned into "/" afterwards.
        $file = html_entity_decode($_GET['file'], ENT_COMPAT, 'UTF-8');
        if (strpos($file, "\0") !== false) {
            return '';
        }

        return basename($file);
    }

    /**
     * Determine the mime type by the extension of the file name
     *
     * @access private
     * @param string $file
     * @return string|null Mime type, or null if the extension is not accepted
     */
    private function _mimeType($file)
    {
        $types = array(
            'pdf'  => 'application/pdf',
            'zip'  => 'application/zip',
            'doc'  => 'application/msword',
            'xls'  => 'application/vnd.ms-excel',
            'ppt'  => 'application/vnd.ms-powerpoint',
            'gif'  => 'image/gif',
            'png'  => 'image/png',
            'jpeg' => 'image/jpeg', // "image/jpg" is not a registered type
            'jpg'  => 'image/jpeg',
            'mpeg' => 'video/mpeg',
            'mpg'  => 'video/mpeg',
            'mpe'  => 'video/mpeg',
            'mov'  => 'video/quicktime',
            'avi'  => 'video/x-msvideo',
            'mp3'  => 'audio/mpeg',
            'wav'  => 'audio/x-wav',
            'xml'  => 'text/xml',
            'txt'  => 'text/plain',
            '7z'   => 'application/octet-stream',
            'exe'  => 'application/octet-stream'
        );

        $pos = strrpos($file, '.');
        if ($pos === false) {
            return null;
        }

        $ext = strtolower(substr($file, $pos + 1));
        return isset($types[$ext]) ? $types[$ext] : null;
    }

    /**
     * See if a value is empty or contains one of the given (partial) names
     *
     * An empty value is refused as soon as at least one name is configured;
     * with an empty list nothing is refused.
     *
     * @access private
     * @param string $subject
     * @param array $names
     * @return bool
     */
    private function _matchesAny($subject, $names)
    {
        if (empty($names) || !is_array($names)) {
            return false;
        }

        foreach ($names as $name) {
            if ($subject === '') {
                return true;
            }
            // Skip empty names: an empty needle would match everything
            if ((string) $name !== '' && stripos($subject, (string) $name) !== false) {
                return true;
            }
        }

        return false;
    }

    /**
     * See if the host name which belongs to the IP address is refused
     *
     * @access private
     * @param string $ip
     * @return bool
     */
    private function _isRefusedHost($ip)
    {
        if (empty($this->_hosts)) {
            // Nothing to compare with, so no need for a DNS lookup
            return false;
        }

        // gethostbyaddr() raises a warning and returns false on a malformed
        // address; treat that the same as an empty host name.
        if (filter_var($ip, FILTER_VALIDATE_IP) === false) {
            return true;
        }

        $host = gethostbyaddr($ip);
        if (!is_string($host)) {
            $host = '';
        }

        return $this->_matchesAny($host, $this->_hosts);
    }

    /**
     * Register this visit in redis and see if the visitor is harvesting
     *
     * Visits are stored per IP address as a JSON encoded list. Visits older
     * than the configured number of seconds are dropped.
     *
     * @access private
     * @param string $ip
     * @param string $file
     * @return bool True if there are more visits than allowed in the timeframe
     */
    private function _isHarvesting($ip, $file)
    {
        require_once($this->_predisPath);

        // The class name is kept in a string, so that this file still parses
        // with PHP versions which do not know namespaces.
        $class = 'Predis\\Client';
        if (!class_exists($class)) {
            trigger_error('Predis client not available, rate limit skipped', E_USER_WARNING);
            return false;
        }
        $redis = new $class();

        $pkey = 'download_' . $ip; // our checks are bound to IP numbers

        $visits = array();
        $stored = $redis->get($pkey);
        if (is_string($stored) && $stored !== '') {
            // Previous visits found
            $decoded = json_decode($stored, true);
            if (is_array($decoded)) {
                $visits = $decoded;
            }
        }

        // Register this visit
        $now = time();
        $visits[] = array('time' => $now, 'file' => $file);

        // See what we need to write back
        $threshold = $now - $this->_predisTime;
        $write = array();
        foreach ($visits as $visit) {
            if (is_array($visit) && isset($visit['time']) && $visit['time'] > $threshold) {
                $write[] = $visit; // Keep it in the stack
            }
        }

        // Write visits to redis
        $redis->set($pkey, json_encode($write));

        return count($write) > $this->_predisCnt;
    }

    /**
     * Send a 404 response
     *
     * @access private
     * @param string $html Body of the response
     * @return void
     */
    private function _sendNotFound($html)
    {
        header('HTTP/1.0 404 Not Found');
        echo $html;
    }

    /**
     * Send the 403 response for bots
     *
     * @access private
     * @return void
     */
    private function _sendForbidden()
    {
        header('HTTP/1.0 403 Forbidden');
        echo '<h1>Not authorized</h1>'
            . ' <p> We have detected that you are (behaving like) a bot, which is not respecting'
            . ' the content of robots.txt and harvesting files from our site. </p>'
            . ' <p> Please correct this behavior, since it is considered quite impolite and'
            . ' disrespectful to ignore explicit directions by web site owners, put into place'
            . ' according to <a href="http://www.robotstxt.org/robotstxt.html">worldwide standards</a>. </p>';
    }

    /**
     * Send the file as a download
     *
     * @access private
     * @param string $fullPath Location of the file on disk
     * @param string $file Name which is offered to the client
     * @param string $ctype Mime type
     * @return void
     */
    private function _sendFile($fullPath, $file, $ctype)
    {
        // Required for IE, otherwise Content-disposition is ignored
        if (ini_get('zlib.output_compression')) {
            ini_set('zlib.output_compression', 'Off');
        }

        header('Pragma: public'); // required
        header('Last-Modified: ' . gmdate('D, d M Y H:i:s', filemtime($fullPath)) . ' GMT');
        header('Expires: 0');
        header('Cache-Control: must-revalidate, post-check=0, pre-check=0');
        header('Cache-Control: private', false); // required for certain browsers
        // Escape quotes and backslashes, they would end the quoted file name
        header(sprintf('Content-Disposition: attachment; filename="%s"', addcslashes($file, '"\\')));
        // Use binary to prevent files from being encoded/messed up during transfer
        header('Content-Transfer-Encoding: binary');
        header('Content-Length: ' . filesize($fullPath));
        header('Content-Type: ' . $ctype);
        header('Content-Description: File Transfer');

        readfile($fullPath);
    }
}
/**
 * Utility function: the error handler of this script.
 *
 * Errors are not handled here, they are deliberately ignored. The body is
 * empty and nothing is returned.
 *
 * The only error we can expect, is retrieving the host name using
 * gethostbyaddr()
 *
 * @param int $errorNr
 * @param string $errorMessage
 * @param string $errorFile
 * @param int $errorLine
 * @return void
 * @package AntiHarvestDownload
 */
function errorHandler($errorNr, $errorMessage, $errorFile, $errorLine)
{
    // Intentionally a no-op
    return;
}
/*
 * Register errorHandler() as the error handler as soon as this file is
 * loaded. errorHandler() has an empty body, so errors passed to it are
 * ignored.
 */
set_error_handler('errorHandler');
?>
|