/**
 * <p>
 * Script to safely serve a file for download.
 * Most importantly, it blocks downloads by bots to save bandwidth on our server.
 * </p>
 * <p>
 * No harvesting allowed!
 * </p>
 * @author Nico den Boer <nico@nicodenboer.com>, <www.nicodenboer.com>, <www.denboer-ims.nl>
 * @version 1.0.1
 * @package AntiHarvestDownload
 */
/**
 * Class to do the actual work: validates a download request, refuses known
 * bots / harvesters and sends the requested file to accepted visitors.
 * @package AntiHarvestDownload
 */
class protector {
    /**
     * Holds (partial, lowercase) user agent names which are refused
     * @var array
     * @access private
     */
    private $_agents;

    /**
     * Holds (partial, lowercase) host names which are refused
     * @var array
     * @access private
     */
    private $_hosts;

    /**
     * Holds path to actual files; the file name is appended to it as-is
     * @var string
     * @access private
     */
    private $_path;

    /**
     * Determines if we use predis or not
     * @var bool
     * @access private
     */
    private $_predisUse = false;

    /**
     * Determines path to predis class
     * @var string
     * @access private
     */
    private $_predisPath = '';

    /**
     * Holds the time in seconds that visits will expire
     * @var int
     * @access private
     */
    private $_predisTime = 0;

    /**
     * Holds the number of visits which are allowed within the timeframe
     * @var int
     * @access private
     */
    private $_predisCnt = 0;

    /**
     * Constructor. Installs the default lists of refused user agents and hosts.
     * @access public
     * @return void
     */
    public function __construct() {
        $this->_agents = array(
            'wget', 'emailsiphon', 'webzip', 'msproxy/2.0',
            'emailwolf', 'webbandit', 'ms frontpage', 'bot',
            'slurp', 'scooter', 'spider', 'crawler',
            'worm', 'internetseer.com', 'archiver', 'msnptc',
            'libwww-perl', 'channel-index', 'linkwalker', 'holmes',
            'yeti', 'indexer'
        );
        $this->_hosts = array(
            'bot', 'spider', 'crawler', 'yandex'
        );
        $this->_path = '';
    }

    /**
     * Set (partial) names of user agents, which we know they are bots.
     * Names should be in lowercase.
     * @access public
     * @param array $agents
     * @return void
     */
    public function setAgents($agents) {
        $this->_agents = $agents;
    }

    /**
     * Set (partial) names of hosts which we know as bots or harvesters.
     * Names should be in lowercase.
     * @access public
     * @param array $hosts
     * @return void
     */
    public function setHosts($hosts) {
        $this->_hosts = $hosts;
    }

    /**
     * Set the path to the physical downloads.
     * The requested file name is concatenated directly to this value,
     * so it should end with a directory separator.
     * @access public
     * @param string $str
     * @return void
     */
    public function setPath($str) {
        $this->_path = $str;
    }

    /**
     * Initialize predis and thereby enable the visit limit per IP number.
     * @access public
     * @param string $path Path to the predis class
     * @param int $time Time in seconds after which a registered visit expires
     * @param int $cnt Number of visits allowed within that time
     * @return void
     */
    public function setPredis($path, $time, $cnt) {
        $this->_predisUse = true;
        $this->_predisPath = $path;
        $this->_predisTime = $time;
        $this->_predisCnt = $cnt;
    }

    /**
     * Process the request.
     * Reads the file name from $_GET['file'] and answers with
     * - 404 when the file does not exist or its extension is not accepted,
     * - 403 when the visitor is (behaving like) a bot,
     * - the download headers followed by the file contents otherwise.
     * @access public
     * @return void
     */
    public function process() {
        // File to download; a missing or non-string parameter is treated as an empty name
        $requested = (isset($_GET['file']) && is_string($_GET['file'])) ? $_GET['file'] : '';

        // Clean up the file name. Entities must be decoded BEFORE basename():
        // otherwise '..&#47;x.txt' survives basename() and decodes to '../x.txt'.
        $file = basename(html_entity_decode($requested, ENT_COMPAT, 'UTF-8'));
        $fullPath = $this->_path . $file;

        if ($file === '' || !is_file($fullPath)) {
            // File does not exist, return error code
            header('HTTP/1.0 404 Not Found');
            echo '<h1>File does not exist</h1>';
            return;
        }

        $ctype = $this->_contentType($file);
        if ($ctype === null) {
            // Extension missing or not known.
            header('HTTP/1.0 404 Not Found');
            echo '<h1>Filename not accepted</h1>';
            return;
        }

        $ip = isset($_SERVER['REMOTE_ADDR']) ? (string) $_SERVER['REMOTE_ADDR'] : '';

        // The host is only looked up when the user agent passed our test
        $isBot = $this->_isRefusedAgent() || $this->_isRefusedHost($ip);

        if (!$isBot && $this->_predisUse) {
            $isBot = $this->_exceedsVisitLimit($ip, $file);
        }

        if ($isBot) {
            header('HTTP/1.0 403 Forbidden');
            echo implode("\n", array(
                '<h1>Not authorized</h1>',
                'We have detected that you are (behaving like) a bot, which is not respecting the content',
                'of robots.txt and harvesting files from our site.',
                'Please correct this behavior, since it is considered quite impolite and disrespectful',
                'to ignore explicit directions by web site owners, put into place according to',
                '<a href="http://www.robotstxt.org/robotstxt.html">worldwide standards</a>.',
            ));
            return;
        }

        // It seems we can send this file ...
        $this->_sendFile($fullPath, $file, $ctype);
    }

    /**
     * Determine the mime type from the extension of a file name.
     * @access private
     * @param string $file
     * @return string|null Mime type, or null if the extension is missing or not accepted
     */
    private function _contentType($file) {
        $pos = strrpos($file, '.');
        if ($pos === false) {
            return null;
        }
        $ext = strtolower(substr($file, $pos + 1));
        $types = array(
            'pdf'  => 'application/pdf',
            'zip'  => 'application/zip',
            'doc'  => 'application/msword',
            'xls'  => 'application/vnd.ms-excel',
            'ppt'  => 'application/vnd.ms-powerpoint',
            'gif'  => 'image/gif',
            'png'  => 'image/png',
            'jpeg' => 'image/jpeg',
            'jpg'  => 'image/jpeg',
            'mpeg' => 'video/mpeg',
            'mpg'  => 'video/mpeg',
            'mpe'  => 'video/mpeg',
            'mov'  => 'video/quicktime',
            'avi'  => 'video/x-msvideo',
            'mp3'  => 'audio/mpeg',
            'wav'  => 'audio/x-wav',
            'xml'  => 'text/xml',
            'txt'  => 'text/plain',
            '7z'   => 'application/octet-stream',
            'exe'  => 'application/octet-stream',
        );
        return isset($types[$ext]) ? $types[$ext] : null;
    }

    /**
     * See if a value is empty or contains one of the given (partial) names.
     * An empty list never matches, not even for an empty value.
     * @access private
     * @param string $value
     * @param array $names
     * @return bool
     */
    private function _matchesAny($value, $names) {
        foreach ($names as $name) {
            if (strlen($value) == 0 || stripos($value, $name) !== false) {
                return true;
            }
        }
        return false;
    }

    /**
     * See if the user agent of this request needs to be refused.
     * A request without user agent is refused as well.
     * @access private
     * @return bool
     */
    private function _isRefusedAgent() {
        $agent = isset($_SERVER['HTTP_USER_AGENT']) ? (string) $_SERVER['HTTP_USER_AGENT'] : '';
        return $this->_matchesAny($agent, $this->_agents);
    }

    /**
     * See if the host name belonging to an IP number needs to be refused.
     * A host name which can not be determined is refused as well.
     * @access private
     * @param string $ip
     * @return bool
     */
    private function _isRefusedHost($ip) {
        if (empty($this->_hosts)) {
            // Nothing to compare with, so skip the lookup
            return false;
        }
        // gethostbyaddr() returns false on malformed input; the cast turns that into ''
        $host = ($ip === '') ? '' : (string) gethostbyaddr($ip);
        return $this->_matchesAny($host, $this->_hosts);
    }

    /**
     * Create the predis client.
     * NOTE(review): assumes the path given to setPredis() is the file which
     * defines or autoloads Predis\Client - confirm against the caller.
     * @access private
     * @return object|null Client, or null if the class is not available
     */
    private function _createRedisClient() {
        // Class name as string, so this file still parses on PHP versions without namespaces
        $class = 'Predis\\Client';
        if (!class_exists($class) && is_string($this->_predisPath) && is_file($this->_predisPath)) {
            require_once $this->_predisPath;
        }
        if (!class_exists($class)) {
            error_log('AntiHarvestDownload: Predis\\Client is not available, visit limit is skipped');
            return null;
        }
        return new $class();
    }

    /**
     * Register this visit in redis and see if the visitor is harvesting.
     * Visits older than the configured time are dropped from the stored list.
     * @access private
     * @param string $ip
     * @param string $file
     * @return bool True if more visits than allowed were made within the timeframe
     */
    private function _exceedsVisitLimit($ip, $file) {
        $redis = $this->_createRedisClient();
        if ($redis === null) {
            return false;
        }

        $pkey = 'download_' . $ip; // our checks are bound to IP numbers
        $stored = $redis->get($pkey);

        // New visitor, or previous visits found
        $visits = (is_string($stored) && strlen($stored) > 0) ? json_decode($stored, true) : array();
        if (!is_array($visits)) {
            // Stored value could not be decoded; start over
            $visits = array();
        }

        // Register this visit
        $now = time();
        $visits[] = array(
            'time' => $now,
            'file' => $file
        );

        // See what we need to write back
        $write = array();
        foreach ($visits as $visit) {
            if (isset($visit['time']) && $visit['time'] > ($now - $this->_predisTime)) {
                $write[] = $visit; // Keep it in the stack
            }
        }

        // Write visits to redis
        $redis->set($pkey, json_encode($write));

        return count($write) > $this->_predisCnt;
    }

    /**
     * Send the download headers, followed by the contents of the file.
     * @access private
     * @param string $fullPath Path of the file on disk
     * @param string $file File name as presented to the visitor
     * @param string $ctype Mime type
     * @return void
     */
    private function _sendFile($fullPath, $file, $ctype) {
        // Required for IE, otherwise Content-disposition is ignored
        if (ini_get('zlib.output_compression')) {
            ini_set('zlib.output_compression', 'Off');
        }
        header('Pragma: public'); // required
        header('Last-Modified: ' . gmdate('D, d M Y H:i:s', filemtime($fullPath)) . ' GMT');
        header('Expires: 0');
        header('Cache-Control: must-revalidate, post-check=0, pre-check=0');
        header('Cache-Control: private', false); // required for certain browsers
        // Escape quotes and backslashes, so the quoted file name stays intact
        header(sprintf('Content-Disposition: attachment; filename="%s";', addcslashes($file, '"\\')));
        // Send Content-Transfer-Encoding HTTP header
        // (use binary to prevent files from being encoded/messed up during transfer)
        header('Content-Transfer-Encoding: binary');
        header('Content-Length: ' . filesize($fullPath));
        header('Content-Type: ' . $ctype);
        header('Content-Description: File Transfer');
        // Headers alone do not deliver anything: write the file itself
        readfile($fullPath);
    }
}
/**
 * Utility function: handles an error.
 * Basically, we don't handle errors in this script; we ignore them :)
 * The only error we expect comes from retrieving the host name using gethostbyaddr().
 * @param int $errorNr
 * @param string $errorMessage
 * @param string $errorFile
 * @param int $errorLine
 * @return void
 * @package AntiHarvestDownload
 */
function errorHandler($errorNr, $errorMessage, $errorFile, $errorLine) {