<?php
/**
* <p>
* Script to safely serve a file for download.
* Most importantly, it blocks downloads by bots to save bandwidth on our server.
* </p>
*
* <p>
* No harvesting allowed!
* </p>
*
* @author Nico den Boer <nico@nicodenboer.com>, <www.nicodenboer.com>, <www.denboer-ims.nl>
* @version 1.0.1
* @package AntiHarvestDownload
*/
/**
 * Serves whitelisted files from a download directory and refuses bots.
 *
 * The requested file name is read from $_GET['file']. A request is refused
 * when the user agent or the reverse-DNS host name of the client contains one
 * of the configured fragments, or - when rate limiting was enabled through
 * setPredis() - when one IP address requests more files than allowed within
 * the configured time window.
 *
 * @package AntiHarvestDownload
 */
class protector {
    /**
     * Lowercase (partial) user agent names that are refused
     *
     * @var array
     * @access private
     */
    private $_agents;
    /**
     * Lowercase (partial) host names that are refused
     *
     * @var array
     * @access private
     */
    private $_hosts;
    /**
     * Path prefix of the physical files; it is concatenated with the file
     * name as-is, so it must end with a directory separator
     *
     * @var string
     * @access private
     */
    private $_path;
    /**
     * Determines if we use predis or not
     *
     * @var bool
     * @access private
     */
    private $_predisUse;
    /**
     * Path to the file that loads the predis classes
     *
     * @var string
     * @access private
     */
    private $_predisPath;
    /**
     * Time in seconds after which a registered visit expires
     *
     * @var int
     * @access private
     */
    private $_predisTime;
    /**
     * Number of visits which are allowed within the timeframe
     *
     * @var int
     * @access private
     */
    private $_predisCnt;

    /**
     * Constructor. Installs the default block lists; rate limiting is off
     * until setPredis() is called.
     *
     * @access public
     * @return void
     */
    public function __construct() {
        $this->_agents = array(
            'wget', 'emailsiphon', 'webzip', 'msproxy/2.0',
            'emailwolf', 'webbandit', 'ms frontpage', 'bot',
            'slurp', 'scooter', 'spider', 'crawler',
            'worm', 'internetseer.com', 'archiver', 'msnptc',
            'libwww-perl', 'channel-index', 'linkwalker', 'holmes',
            'yeti', 'indexer'
        );
        $this->_hosts = array(
            'bot', 'spider', 'crawler', 'yandex'
        );
        $this->_path = '';
        $this->_predisUse = false;
        $this->_predisPath = '';
        $this->_predisTime = 0;
        $this->_predisCnt = 0;
    }

    /**
     * Set (partial) names of user agents, which we know they are bots.
     * Names should be in lowercase. An empty array disables the check.
     *
     * @access public
     * @param array $agents
     * @return void
     */
    public function setAgents($agents) {
        $this->_agents = $agents;
    }

    /**
     * Set (partial) host names which we know as bots or harvesters.
     * Names should be in lowercase. An empty array disables the check
     * (and the reverse-DNS lookup that goes with it).
     *
     * @access public
     * @param array $hosts
     * @return void
     */
    public function setHosts($hosts) {
        $this->_hosts = $hosts;
    }

    /**
     * Set the path to the physical downloads.
     * The value is used as a plain prefix, so include the trailing separator.
     *
     * @access public
     * @param string $str
     * @return void
     */
    public function setPath($str) {
        $this->_path = $str;
    }

    /**
     * Enable rate limiting through predis
     *
     * @access public
     * @param string $path File to require in order to load predis
     * @param int $time Length of the observation window in seconds
     * @param int $cnt Number of downloads allowed per IP within the window
     * @return void
     */
    public function setPredis($path, $time, $cnt) {
        $this->_predisUse = true;
        $this->_predisPath = $path;
        $this->_predisTime = $time;
        $this->_predisCnt = $cnt;
    }

    /**
     * Process the request: validate the file name, run the bot checks and
     * either stream the file or answer with 404 / 403.
     *
     * @access public
     * @return void
     */
    public function process() {
        // Only accept a plain string; "?file[]=x" would hand us an array.
        $file = (isset($_GET['file']) && is_string($_GET['file'])) ? $_GET['file'] : '';
        // Decode entities BEFORE stripping directories. The reverse order lets
        // "..&#47;secret.txt" pass basename() untouched and then decode into
        // "../secret.txt", escaping the download directory.
        $file = html_entity_decode($file, ENT_COMPAT, 'UTF-8');
        $file = basename($file);
        $fullPath = $this->_path . $file;

        // is_file() instead of file_exists(): a directory is not a download.
        if ($file === '' || strpos($file, "\0") !== false || !is_file($fullPath)) {
            $this->_sendNotFound('File does not exist');
            return;
        }

        // Only extensions from the whitelist are served.
        $dot = strrpos($file, '.');
        $ctype = ($dot === false) ? null : $this->_mimeTypeFor(strtolower(substr($file, $dot + 1)));
        if ($ctype === null) {
            $this->_sendNotFound('Filename not accepted');
            return;
        }

        $agent = isset($_SERVER['HTTP_USER_AGENT']) ? (string) $_SERVER['HTTP_USER_AGENT'] : '';
        $ip = isset($_SERVER['REMOTE_ADDR']) ? (string) $_SERVER['REMOTE_ADDR'] : '';

        // Cheapest check first; the rate limiter only runs for clients that
        // passed both name based checks.
        if ($this->_isRefusedAgent($agent)
            || $this->_isRefusedHost($ip)
            || $this->_isHarvesting($ip, $file)) {
            $this->_sendForbidden();
            return;
        }

        $this->_sendFile($fullPath, $file, $ctype);
    }

    /**
     * Map a lowercase file extension to the content type we send for it.
     *
     * @access private
     * @param string $ext
     * @return string|null Null when the extension is not whitelisted
     */
    private function _mimeTypeFor($ext) {
        $types = array(
            'pdf'  => 'application/pdf',
            'zip'  => 'application/zip',
            'doc'  => 'application/msword',
            'xls'  => 'application/vnd.ms-excel',
            'ppt'  => 'application/vnd.ms-powerpoint',
            'gif'  => 'image/gif',
            'png'  => 'image/png',
            'jpeg' => 'image/jpeg',
            'jpg'  => 'image/jpeg',
            'mpeg' => 'video/mpeg',
            'mpg'  => 'video/mpeg',
            'mpe'  => 'video/mpeg',
            'mov'  => 'video/quicktime',
            'avi'  => 'video/x-msvideo',
            'mp3'  => 'audio/mpeg',
            'wav'  => 'audio/x-wav',
            'xml'  => 'text/xml',
            'txt'  => 'text/plain',
            '7z'   => 'application/octet-stream',
            'exe'  => 'application/octet-stream'
        );
        return isset($types[$ext]) ? $types[$ext] : null;
    }

    /**
     * Tell whether the user agent is empty or contains a refused fragment.
     *
     * @access private
     * @param string $agent
     * @return bool
     */
    private function _isRefusedAgent($agent) {
        if (empty($this->_agents)) {
            return false; // empty list disables the check
        }
        if ($agent === '') {
            return true; // real browsers always identify themselves
        }
        foreach ($this->_agents as $fragment) {
            if (stripos($agent, $fragment) !== false) {
                return true;
            }
        }
        return false;
    }

    /**
     * Tell whether the reverse-DNS name of the client contains a refused
     * fragment. A lookup that yields nothing is refused as well.
     *
     * @access private
     * @param string $ip
     * @return bool
     */
    private function _isRefusedHost($ip) {
        if (empty($this->_hosts)) {
            return false; // empty list disables the check, skip the DNS lookup
        }
        $host = ($ip === '') ? false : gethostbyaddr($ip);
        if ($host === false || $host === '') {
            return true;
        }
        foreach ($this->_hosts as $fragment) {
            if (stripos($host, $fragment) !== false) {
                return true;
            }
        }
        return false;
    }

    /**
     * Register this download in redis and tell whether the IP address has
     * exceeded the allowed number of downloads within the time window.
     *
     * Fails open: when predis cannot be loaded or redis raises an exception
     * the download is allowed rather than answering with a fatal error.
     *
     * @access private
     * @param string $ip
     * @param string $file
     * @return bool
     */
    private function _isHarvesting($ip, $file) {
        if (!$this->_predisUse) {
            return false;
        }
        require_once($this->_predisPath);
        // The class name is kept in a string so that this file still parses
        // on PHP versions without namespace support.
        $clientClass = 'Predis\\Client';
        if (!class_exists($clientClass)) {
            return false;
        }
        try {
            $redis = new $clientClass();
            $key = 'download_' . $ip; // our checks are bound to IP numbers
            $stored = $redis->get($key);
            $visits = (is_string($stored) && $stored !== '') ? json_decode($stored, true) : array();
            if (!is_array($visits)) {
                $visits = array(); // corrupted entry, start over
            }
            // Register this visit
            $now = time();
            $visits[] = array(
                'time' => $now,
                'file' => $file
            );
            // Keep only the visits that are still inside the window
            $cutoff = $now - $this->_predisTime;
            $recent = array();
            foreach ($visits as $visit) {
                if (is_array($visit) && isset($visit['time']) && $visit['time'] > $cutoff) {
                    $recent[] = $visit;
                }
            }
            $redis->set($key, json_encode($recent));
            return count($recent) > $this->_predisCnt;
        }
        catch (Exception $e) {
            return false;
        }
    }

    /**
     * Answer with a 404 status and a short message.
     *
     * @access private
     * @param string $message Fixed text, not user input
     * @return void
     */
    private function _sendNotFound($message) {
        header('HTTP/1.0 404 Not Found');
        echo '<h1>' . $message . '</h1>';
    }

    /**
     * Answer with a 403 status and the explanation for bots.
     *
     * @access private
     * @return void
     */
    private function _sendForbidden() {
        header('HTTP/1.0 403 Forbidden');
        echo implode("\n", array(
            '<h1>Not authorized</h1>',
            '<p>',
            'We have detected that you are (behaving like) a bot, which is not respecting the content',
            'of robots.txt and harvesting files from our site.',
            '</p>',
            '<p>',
            'Please correct this behavior, since it is considered quite impolite and disrespectful',
            'to ignore explicit directions by web site owners, put into place according to',
            '<a href="http://www.robotstxt.org/robotstxt.html">worldwide standards</a>.',
            '</p>'
        ));
    }

    /**
     * Send the download headers followed by the file contents.
     *
     * @access private
     * @param string $fullPath Location of the file on disk
     * @param string $file Name offered to the client
     * @param string $ctype Content type
     * @return void
     */
    private function _sendFile($fullPath, $file, $ctype) {
        // Required for IE, otherwise Content-disposition is ignored
        if (ini_get('zlib.output_compression')) {
            ini_set('zlib.output_compression', 'Off');
        }
        header('Pragma: public'); // required
        header('Last-Modified: ' . gmdate('D, d M Y H:i:s', filemtime($fullPath)) . ' GMT');
        header('Expires: 0');
        header('Cache-Control: must-revalidate, post-check=0, pre-check=0');
        header('Cache-Control: private', false); // required for certain browsers
        // Escape quotes and backslashes so the quoted file name stays well formed
        header(sprintf('Content-Disposition: attachment; filename="%s"', addcslashes($file, '"\\')));
        // Use binary to prevent files from being encoded/messed up during transfer
        header('Content-Transfer-Encoding: binary');
        header('Content-Length: ' . filesize($fullPath));
        header('Content-Type: ' . $ctype);
        header('Content-Description: File Transfer');
        readfile($fullPath);
    }
}
/**
 * Utility function. Keeps PHP errors out of the response body.
 *
 * A warning printed in the middle of a download would corrupt the file that
 * is being sent, so errors are never displayed. Instead of dropping them
 * silently they are written to the PHP error log, which keeps real bugs
 * visible. Errors disabled through error_reporting() or silenced with the
 * @ operator are not logged.
 *
 * The error we expect most is a failing host name lookup in gethostbyaddr()
 *
 * @param int $errorNr
 * @param string $errorMessage
 * @param string $errorFile
 * @param int $errorLine
 * @return bool Always true, so PHP's built-in handler does not run
 * @package AntiHarvestDownload
 */
function errorHandler($errorNr, $errorMessage, $errorFile, $errorLine) {
    // error_reporting() is masked while the @ operator is active, so this
    // single test covers both a lowered level and explicit suppression.
    if (error_reporting() & $errorNr) {
        error_log(sprintf(
            'AntiHarvestDownload: [%d] %s in %s:%d',
            $errorNr,
            $errorMessage,
            $errorFile,
            $errorLine
        ));
    }
    return true;
}
// Route every PHP error through errorHandler() instead of PHP's built-in handler.
set_error_handler('errorHandler');
?>
|