<?php
/**
* phpWebHacks.php 1.5
* This class is a powerful tool for HTTP scripting with PHP.
* It simulates a web browser, only that you use it with lines of code
* rather than mouse and keyboard.
*
* See the documentation at http://php-http.com/documentation
* See the examples at http://php-http.com/examples
*
* Author Nashruddin Amin - me@nashruddin.com
* License GPL
* Website http://php-http.com
*/
class phpWebHacks
{
    /* User-Agent header value sent with every request */
    private $_user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9) Gecko/2008052906 Firefox/3.0';
    /* multipart/form-data boundary */
    private $_boundary = '----PhPWebhACKs-RoCKs--';
    /* proxy settings, see setProxy() */
    private $_useproxy = false;
    private $_proxy_host = '';
    private $_proxy_port = '';
    private $_proxy_user = '';
    private $_proxy_pass = '';
    /* true when zlib is available and gzip responses can be decoded */
    private $_usegzip = false;
    /* logging settings, see setDebug() */
    private $_log = false;
    private $_debugdir = '.log';
    private $_debugnum = 1;
    /* seconds to sleep after each request */
    private $_delay = 1;
    /* cache of response bodies; index 0 is the most recent page */
    private $_body = array();
    /* cookie jar, keyed by md5("domain;path;name") */
    private $_cookies = array();
    /* URL of the last request; base for resolving relative URLs */
    private $_addressbar = '';
    /* URL of the last page answered with 200; sent as Referer */
    private $_referer = '';
    /* true when the last parsed form asked for multipart/form-data */
    private $_multipart = false;
    /* statistics written to the log by the destructor */
    private $_timestart = 0;
    private $_bytes = 0;

    /**
     * Constructor.
     *
     * Enables logging (which creates and empties the debug directory),
     * detects zlib support and records the start time.
     */
    public function __construct()
    {
        $this->setDebug(true);

        /* check if zlib is available */
        if (function_exists('gzdecode') || function_exists('gzopen')) {
            $this->_usegzip = true;
        }

        /* start time */
        $this->_timestart = microtime(true);
    }

    /**
     * Destructor.
     *
     * Appends the elapsed time and the number of transferred kilobytes to
     * the header log when logging is enabled.
     */
    public function __destruct()
    {
        if (!$this->_log) {
            return;
        }

        /* get elapsed time and transferred bytes */
        $time  = sprintf("%02.1f", microtime(true) - $this->_timestart);
        $bytes = sprintf("%d", ceil($this->_bytes / 1024));

        /* the log directory may be missing or unwritable: never fail here */
        $fp = @fopen("$this->_debugdir/headers.txt", 'a');
        if ($fp !== false) {
            fputs($fp, "------ Transferred " . $bytes . "kb in $time sec ------\r\n");
            fclose($fp);
        }
    }

    /**
     * HEAD
     *
     * @param string $url absolute URL, or URL relative to the last request
     * @return array|string parsed response headers ('' for non-HTTP URLs)
     */
    public function head($url)
    {
        return $this->fetch($url, 'HEAD');
    }

    /**
     * GET
     *
     * @param string $url absolute URL, or URL relative to the last request
     * @return string response body after following redirects
     */
    public function get($url)
    {
        return $this->fetch($url, 'GET');
    }

    /**
     * POST
     *
     * @param string $url   absolute URL, or URL relative to the last request
     * @param array  $form  field name => value
     * @param array  $files field name => local file path
     * @return string response body after following redirects
     */
    public function post($url, $form = array(), $files = array())
    {
        return $this->fetch($url, 'POST', 10, $form, $files);
    }

    /**
     * Make HTTP request
     *
     * Sends the request, stores cookies, decodes chunked/gzip bodies and
     * follows HTTP (3xx + Location) and <meta http-equiv=refresh> redirects
     * with GET until $maxredir is exhausted.
     *
     * NOTE: https URLs through a proxy are sent to the proxy as a plain
     * absolute-URI request; CONNECT tunnelling is not implemented.
     *
     * @param string $url      absolute or relative URL
     * @param string $method   'HEAD', 'GET' or 'POST'
     * @param int    $maxredir number of redirects that may still be followed
     * @param array  $form     form fields for POST
     * @param array  $files    files for POST
     * @return array|string parsed headers for HEAD, body of the final page
     *                      otherwise, '' when the URL is not http(s)
     * @throws RuntimeException when the URL cannot be parsed or the
     *                          connection cannot be opened
     */
    protected function fetch($url, $method, $maxredir = 10, $form = array(), $files = array())
    {
        /* convert to absolute if relative URL */
        $url = $this->getAbsUrl($url, $this->_addressbar);

        /* only http or https */
        if (!preg_match('#^https?://#i', (string) $url)) {
            return '';
        }

        /* validate the URL before anything is built from it */
        $parts = parse_url($url);
        if (!is_array($parts) || empty($parts['host'])) {
            throw new RuntimeException("Invalid URL: $url");
        }
        $scheme = strtolower($parts['scheme']);
        $host   = $parts['host'];
        $port   = isset($parts['port']) ? (int) $parts['port'] : ($scheme == 'https' ? 443 : 80);

        /* cache URL */
        $this->_addressbar = $url;

        /* build request */
        $reqbody = $this->getReqBody($form, $files);
        $reqhead = $this->getReqHead($url, $method, strlen($reqbody), !empty($files));

        /* log request */
        if ($this->_log) {
            $this->logHttpStream($url, $reqhead, $reqbody);
        }

        /* open connection */
        $errno  = 0;
        $errstr = '';
        if ($this->_useproxy) {
            $peer = $this->_proxy_host;
            $fp   = @fsockopen($this->_proxy_host, $this->_proxy_port, $errno, $errstr);
        } else {
            $peer = $host;
            $fp   = @fsockopen($scheme == 'https' ? "ssl://$host" : $host, $port, $errno, $errstr);
        }

        /* always check */
        if (!$fp) {
            throw new RuntimeException("Cannot connect to $peer! $errstr ($errno)");
        }

        /* send request & read response until the server closes the socket */
        @fwrite($fp, $reqhead . $reqbody);
        $res = @stream_get_contents($fp);
        if ($res === false) {
            $res = '';
        }
        fclose($fp);

        /* set delay between requests. behave! */
        sleep($this->_delay);

        /* transferred bytes */
        $this->_bytes += strlen($reqhead) + strlen($reqbody) + strlen($res);

        /* get response header & body (the body may be missing entirely) */
        $pieces  = explode("\r\n\r\n", $res, 2);
        $reshead = $pieces[0];
        $resbody = isset($pieces[1]) ? $pieces[1] : '';

        /* convert header to associative array */
        $head = $this->parseHead($reshead);

        /* return immediately if HEAD */
        if ($method == 'HEAD') {
            if ($this->_log) {
                $this->logHttpStream($url, $reshead, null);
            }
            return $head;
        }

        /* cookies */
        if (!empty($head['Set-Cookie'])) {
            $this->saveCookies($head['Set-Cookie'], $url);
        }

        /* referer */
        $code = isset($head['Status']['Code']) ? (int) $head['Status']['Code'] : 0;
        if ($code == 200) {
            $this->_referer = $url;
        }

        /* transfer-encoding: chunked */
        if (stripos($this->headerValue($head, 'Transfer-Encoding'), 'chunked') !== false) {
            $body = $this->joinChunks($resbody);
        } else {
            $body = $resbody;
        }

        /* content-encoding: gzip */
        if (stripos($this->headerValue($head, 'Content-Encoding'), 'gzip') !== false) {
            $body = $this->gunzip($body);
        }

        /* log response */
        if ($this->_log) {
            $this->logHttpStream($url, $reshead, $body);
        }

        /* cache body */
        array_unshift($this->_body, $body);

        $location = $this->headerValue($head, 'Location');
        if ($location !== '' && $code >= 300 && $code < 400) {
            /* redirects: 3xx. Location on other statuses (e.g. 201) is not a redirect */
            if ($maxredir > 0) {
                $this->fetch($this->getAbsUrl($location, $url), 'GET', $maxredir - 1);
            }
        } elseif ($maxredir > 0) {
            /* redirects: <meta http-equiv=refresh...> */
            $meta = $this->parseMetaTags($body);
            if (isset($meta['http-equiv']['refresh'])) {
                $pieces = explode(';', $meta['http-equiv']['refresh'], 2);
                $loc    = isset($pieces[1]) ? trim($pieces[1]) : '';
                $loc    = preg_replace('/^url\s*=\s*/i', '', $loc);
                $loc    = trim($loc, " \t\r\n'\"");
                if ($loc !== '') {
                    $target = $this->getAbsUrl($loc, $url);
                    /* a page refreshing to itself would loop */
                    if ($target != $url) {
                        $this->fetch($target, 'GET', $maxredir - 1);
                    }
                }
            }
        }

        /* get body of the final page and clear the rest of the cache */
        $body = $this->_body[0];
        $this->_body = array($body);

        return $body;
    }

    /**
     * Case-insensitive lookup of a single-valued response header.
     *
     * @param array  $head array produced by parseHead()
     * @param string $name header name
     * @return string header value, '' when absent
     */
    private function headerValue($head, $name)
    {
        foreach ($head as $key => $val) {
            if (is_string($val) && strcasecmp((string) $key, $name) == 0) {
                return $val;
            }
        }
        return '';
    }

    /**
     * Decode a gzip-encoded body.
     *
     * Uses gzdecode() when available; otherwise goes through a private
     * temporary file and gzopen(). The input is returned unchanged when it
     * cannot be decoded.
     *
     * @param string $data gzip stream
     * @return string decoded data
     */
    private function gunzip($data)
    {
        if (function_exists('gzdecode')) {
            $out = @gzdecode($data);
            return $out === false ? $data : $out;
        }
        if (!function_exists('gzopen')) {
            return $data;
        }

        $tmp = @tempnam(sys_get_temp_dir(), 'pwh');
        if ($tmp === false) {
            return $data;
        }
        if (@file_put_contents($tmp, $data) === false) {
            @unlink($tmp);
            return $data;
        }

        $out = '';
        $gz  = @gzopen($tmp, 'r');
        if ($gz) {
            while (!gzeof($gz)) {
                $piece = @gzread($gz, 4096);
                if ($piece === false || $piece === '') {
                    break;
                }
                $out .= $piece;
            }
            gzclose($gz);
        }
        @unlink($tmp);

        return $gz ? $out : $data;
    }

    /**
     * Build request header
     *
     * @param string $url      absolute URL of the request
     * @param string $method   'HEAD', 'GET' or 'POST'
     * @param int    $bodylen  length of the request body in bytes
     * @param bool   $sendfile true when files are uploaded (forces multipart)
     * @return string complete header block, terminated by an empty line
     */
    protected function getReqHead($url, $method, $bodylen = 0, $sendfile = true)
    {
        $parts = parse_url($url);
        if (!is_array($parts)) {
            $parts = array();
        }
        $scheme = isset($parts['scheme']) ? strtolower($parts['scheme']) : 'http';
        $host   = isset($parts['host']) ? $parts['host'] : '';

        /* setup path: the query must survive even when the path is empty */
        $path = (isset($parts['path']) && $parts['path'] !== '') ? $parts['path'] : '/';
        if (isset($parts['query']) && $parts['query'] !== '') {
            $path .= '?' . $parts['query'];
        }

        /* Host names the origin server, with the port when it is not the default */
        $hosthdr = $host;
        if (isset($parts['port']) && (int) $parts['port'] != ($scheme == 'https' ? 443 : 80)) {
            $hosthdr .= ':' . $parts['port'];
        }

        /* request line: proxies receive the absolute URL */
        if ($this->_useproxy) {
            $head = "$method $url HTTP/1.1\r\nHost: $hosthdr\r\n";
        } else {
            $head = "$method $path HTTP/1.1\r\nHost: $hosthdr\r\n";
        }

        /* cookies */
        $head .= $this->getCookies($url);

        /* content-type */
        if ($method == 'POST' && ($sendfile || $this->_multipart)) {
            $head .= "Content-Type: multipart/form-data; boundary=$this->_boundary\r\n";
        } elseif ($method == 'POST') {
            $head .= "Content-Type: application/x-www-form-urlencoded\r\n";
        }

        /* set the content length if POST */
        if ($method == 'POST') {
            $head .= "Content-Length: $bodylen\r\n";
        }

        /* basic authentication for the origin server (user:pass in the URL) */
        if (isset($parts['user']) && $parts['user'] !== '') {
            $pass  = isset($parts['pass']) ? $parts['pass'] : '';
            $head .= "Authorization: Basic " . base64_encode($parts['user'] . ':' . $pass) . "\r\n";
        }

        /* basic authentication for proxy */
        if ($this->_useproxy && !empty($this->_proxy_user) && !empty($this->_proxy_pass)) {
            $head .= "Proxy-Authorization: Basic "
                   . base64_encode("$this->_proxy_user:$this->_proxy_pass") . "\r\n";
        }

        /* gzip */
        if ($this->_usegzip) {
            $head .= "Accept-Encoding: gzip\r\n";
        }

        /* make it like real browsers */
        if (!empty($this->_user_agent)) {
            $head .= "User-Agent: $this->_user_agent\r\n";
        }
        if (!empty($this->_referer)) {
            $head .= "Referer: $this->_referer\r\n";
        }

        /* no pipelining yet */
        $head .= "Connection: Close\r\n\r\n";

        /* request header is ready */
        return $head;
    }

    /**
     * Build request body
     *
     * Produces an urlencoded body when only form fields are given and the
     * last parsed form did not ask for multipart; a multipart/form-data body
     * otherwise. Files that do not exist are skipped.
     *
     * @param array $form  field name => value
     * @param array $files field name => local file path
     * @return string request body, '' when there is nothing to send
     */
    protected function getReqBody($form = array(), $files = array())
    {
        /* check for parameters */
        if (empty($form) && empty($files)) {
            return '';
        }

        /* only form available: x-www-urlencoded */
        if (!empty($form) && empty($files) && !$this->_multipart) {
            $pairs = array();
            foreach ($form as $key => $val) {
                $pairs[] = urlencode($key) . '=' . urlencode($val);
            }
            return implode('&', $pairs);
        }

        $body = '';

        /* form */
        foreach ($form as $key => $val) {
            $body .= "--$this->_boundary\r\n"
                   . "Content-Disposition: form-data; name=\"" . $key . "\"\r\n\r\n"
                   . $val . "\r\n";
        }

        /* files */
        foreach ($files as $key => $val) {
            if (!file_exists($val)) {
                continue;
            }
            $body .= "--$this->_boundary\r\n"
                   . "Content-Disposition: form-data; name=\"" . $key . "\"; filename=\"" . basename($val) . "\"\r\n"
                   . "Content-Type: " . $this->getMimeType($val) . "\r\n\r\n"
                   . file_get_contents($val) . "\r\n";
        }

        /* request body is ready! */
        return $body . "--$this->_boundary--";
    }

    /**
     * convert response header to associative array
     *
     * The status line is stored under 'Status' (Version, Code, Message).
     * Set-Cookie headers, in any letter case, are collected into a list
     * under 'Set-Cookie'. Other headers keep the name as sent by the server;
     * a repeated header keeps its last value. Lines without a colon are
     * ignored.
     *
     * @param string $str raw header block
     * @return array
     */
    protected function parseHead($str)
    {
        $lines  = explode("\r\n", (string) $str);
        $status = array_pad(explode(' ', (string) array_shift($lines), 3), 3, '');

        $head = array('Status' => array(
            'Version' => $status[0],
            'Code'    => $status[1],
            'Message' => $status[2]
        ));

        foreach ($lines as $line) {
            if (strpos($line, ':') === false) {
                continue;
            }
            list($key, $val) = explode(':', $line, 2);
            $key = trim($key);

            /* never let a "Status:" header replace the parsed status line */
            if ($key === '' || strcasecmp($key, 'Status') == 0) {
                continue;
            }
            if (strcasecmp($key, 'Set-Cookie') == 0) {
                $head['Set-Cookie'][] = trim($val);
            } else {
                $head[$key] = trim($val);
            }
        }

        return $head;
    }

    /**
     * Read chunked pages
     *
     * Decodes a "Transfer-Encoding: chunked" body. Chunk extensions after
     * ';' are ignored and chunk data is copied byte for byte.
     *
     * @param string $str chunked body
     * @return string decoded body; $str itself when it is not valid chunked data
     */
    protected function joinChunks($str)
    {
        $CRLF  = "\r\n";
        $out   = '';
        $pos   = 0;
        $total = strlen($str);

        while ($pos < $total) {
            /* chunk-size line */
            $eol = strpos($str, $CRLF, $pos);
            if ($eol === false) {
                return $str;
            }
            $size = substr($str, $pos, $eol - $pos);
            $semi = strpos($size, ';');
            if ($semi !== false) {
                $size = substr($size, 0, $semi);
            }
            $size = trim($size);
            if (!preg_match('/^[0-9a-fA-F]+$/', $size)) {
                return $str;
            }

            /* last-chunk */
            $len = hexdec($size);
            if ($len == 0) {
                break;
            }

            /* chunk data, then the CRLF that terminates it */
            $out .= substr($str, $eol + strlen($CRLF), $len);
            $pos  = $eol + strlen($CRLF) + $len;
            if (substr($str, $pos, strlen($CRLF)) === $CRLF) {
                $pos += strlen($CRLF);
            }
        }

        return $out;
    }

    /**
     * Save cookies from server
     *
     * The first "name=value" pair of each header is the cookie; the
     * remaining pairs are attributes. Unknown attributes are ignored.
     * Max-Age takes precedence over Expires.
     *
     * @param array  $set_cookies list of Set-Cookie header values
     * @param string $url         URL the response came from
     * @return bool always true
     */
    protected function saveCookies($set_cookies, $url)
    {
        foreach ((array) $set_cookies as $str) {
            $parts = explode(';', $str);

            /* cookie name and value */
            $pair = explode('=', trim(array_shift($parts)), 2);
            $name = trim($pair[0]);
            if ($name === '') {
                continue;
            }
            $value = isset($pair[1]) ? trim($pair[1]) : '';

            /* cookie attributes */
            $domain   = '';
            $path     = '';
            $expires  = null;
            $maxage   = null;
            $secure   = false;
            $httponly = false;
            foreach ($parts as $part) {
                $attr = explode('=', trim($part), 2);
                $k    = strtolower(trim($attr[0]));
                $v    = isset($attr[1]) ? trim($attr[1]) : '';
                switch ($k) {
                    case 'secure':
                        $secure = true;
                        break;
                    case 'httponly':
                        $httponly = true;
                        break;
                    case 'domain':
                        $domain = strtolower($v);
                        break;
                    case 'path':
                        $path = $v;
                        break;
                    case 'expires':
                        $expires = $v;
                        break;
                    case 'max-age':
                        if (preg_match('/^-?\d+$/', $v)) {
                            $maxage = (int) $v;
                        }
                        break;
                }
            }

            /* cookie's domain */
            if (empty($domain)) {
                $domain = strtolower((string) parse_url($url, PHP_URL_HOST));
            }
            if (substr_count($domain, '.') == 1) {
                $domain = ".$domain";
            }

            /* cookie's path */
            if (empty($path)) {
                $path = (string) parse_url($url, PHP_URL_PATH);
                $path = preg_replace('#/[^/]*$#', '', $path);
                $path = empty($path) ? '/' : $path;
            }

            /* cookie's expire time */
            if ($maxage !== null) {
                $expires = $maxage > 0 ? time() + $maxage : time() - 1;
            } elseif (!empty($expires)) {
                $expires = strtotime($expires);
            }

            /* setup cookie ID, a simple trick to add/update existing cookie */
            $id = md5("$domain;$path;$name");

            /* add/update cookie */
            $this->_cookies[$id] = array(
                'domain'   => $domain,
                'path'     => $path,
                'expires'  => $expires,
                'name'     => $name,
                'value'    => $value,
                'secure'   => $secure,
                'httponly' => $httponly
            );
        }

        return true;
    }

    /**
     * Get cookies for URL
     *
     * Drops expired cookies from the jar, then selects the cookies whose
     * domain and path match the URL. Secure cookies are only returned for
     * https URLs.
     *
     * @param string $url absolute URL of the request
     * @return string "Cookie: ..." header line including CRLF, '' when no cookie applies
     */
    protected function getCookies($url)
    {
        $alive = array();
        $res   = array();

        /* remove expired cookies first */
        foreach ($this->_cookies as $id => $cookie) {
            if (empty($cookie['expires']) || $cookie['expires'] >= time()) {
                $alive[$id] = $cookie;
            }
        }
        $this->_cookies = $alive;

        $parts = parse_url($url);
        if (!is_array($parts)) {
            $parts = array();
        }
        $scheme = isset($parts['scheme']) ? strtolower($parts['scheme']) : '';
        $host   = isset($parts['host']) ? strtolower($parts['host']) : '';
        $path   = (isset($parts['path']) && $parts['path'] !== '') ? $parts['path'] : '/';

        /* get all cookies for this domain and path */
        foreach ($this->_cookies as $cookie) {
            if (!$this->domainMatches($host, $cookie['domain'])) {
                continue;
            }
            if (!$this->pathMatches($path, $cookie['path'])) {
                continue;
            }
            if (!empty($cookie['secure']) && $scheme != 'https') {
                continue;
            }
            $res[] = $cookie['name'] . '=' . $cookie['value'];
        }

        /* return the string for HTTP header */
        return (empty($res) ? '' : 'Cookie: ' . implode('; ', $res) . "\r\n");
    }

    /**
     * Tell whether a host falls under a cookie domain: either the same name
     * or a sub-domain of it. A bare suffix is not enough.
     *
     * @param string $host   lower-case request host
     * @param string $domain cookie domain, with or without leading dot
     * @return bool
     */
    private function domainMatches($host, $domain)
    {
        $domain = strtolower(ltrim((string) $domain, '.'));
        if ($domain === '' || $host === '') {
            return false;
        }
        if ($host === $domain) {
            return true;
        }
        $suffix = '.' . $domain;
        return strlen($host) > strlen($suffix) && substr($host, -strlen($suffix)) === $suffix;
    }

    /**
     * Tell whether a request path falls under a cookie path: identical, or
     * the cookie path is a prefix that ends on a '/' boundary.
     *
     * @param string $path       request path
     * @param string $cookiePath path stored with the cookie
     * @return bool
     */
    private function pathMatches($path, $cookiePath)
    {
        $cookiePath = (string) $cookiePath;
        if ($cookiePath === '' || $path === $cookiePath) {
            return true;
        }
        if (strncmp($path, $cookiePath, strlen($cookiePath)) !== 0) {
            return false;
        }
        return substr($cookiePath, -1) === '/' || substr($path, strlen($cookiePath), 1) === '/';
    }

    /**
     * Convert relative URL to absolute URL
     *
     * @param string $loc    URL to resolve (may already be absolute)
     * @param string $parent absolute URL of the page $loc was found on
     * @return string absolute URL; $loc unchanged when it has a scheme; ''
     *                when both arguments are empty or $parent has no host
     */
    protected function getAbsUrl($loc, $parent)
    {
        /* parameters is required */
        if (empty($loc) && empty($parent)) {
            return '';
        }

        /* links scraped from HTML carry encoded ampersands */
        $loc = str_replace('&amp;', '&', (string) $loc);

        /* return if URL is absolute */
        if (parse_url($loc, PHP_URL_SCHEME) != '') {
            return $loc;
        }

        $parts = parse_url((string) $parent);
        if (!is_array($parts)) {
            $parts = array();
        }
        $scheme = isset($parts['scheme']) ? $parts['scheme'] : 'http';

        /* protocol-relative URL: only the scheme is inherited */
        if (substr($loc, 0, 2) == '//') {
            return "$scheme:$loc";
        }

        /* parent without its own fragment */
        $base = preg_replace('/#.*$/s', '', (string) $parent);

        /* handle anchors and query's part */
        $c = substr($loc, 0, 1);
        if ($c == '#' || $c == '&') {
            return "$base$loc";
        }

        /* handle query string */
        if ($c == '?') {
            $pos = strpos($base, '?');
            if ($pos !== false) {
                $base = substr($base, 0, $pos);
            }
            return "$base$loc";
        }

        if (empty($parts['host'])) {
            return '';
        }
        $authority = $parts['host'] . (isset($parts['port']) ? ':' . $parts['port'] : '');

        /* remove non-directory part from parent's path */
        $dir = isset($parts['path']) ? preg_replace('#/[^/]*$#', '', $parts['path']) : '';

        /* keep query and fragment of $loc away from path normalisation */
        $cut     = strcspn($loc, '?#');
        $locpath = (string) substr($loc, 0, $cut);
        $suffix  = (string) substr($loc, $cut);

        /* root-relative location ignores the parent's directory */
        if (substr($locpath, 0, 1) == '/') {
            $dir = '';
        }

        /* dirty absolute path */
        $abspath = "$dir/$locpath";

        /* replace '//', '/./', '/foo/../' with '/' until nothing changes */
        $dots = array(
            '#/\.?/#',
            '#/(?!\.\.)[^/]+/\.\./#',
            '#/(?!\.\.)[^/]+/\.\.$#',
            '#^/\.\./#'
        );
        do {
            $abspath = preg_replace($dots, '/', $abspath, -1, $count);
        } while ($count > 0);

        /* absolute URL */
        return "$scheme://$authority$abspath$suffix";
    }

    /**
     * Convert meta tags to associative array
     *
     * Only the part of the document before </head> is examined.
     *
     * @param string $html page source
     * @return array 'title' (null when missing), one entry per
     *               <meta name=... content=...>, and ['http-equiv']['refresh']
     *               when present; empty array when there is no </head>
     */
    protected function parseMetaTags($html)
    {
        /* extract to </head> */
        $pos = stripos((string) $html, '</head>');
        if ($pos === false) {
            return array();
        }
        $head = substr($html, 0, $pos);

        /* get page's title */
        $title = preg_match("/<title>(.+)<\/title>/siU", $head, $m) ? $m[1] : null;
        $meta  = array('title' => $title);

        /* get all <meta...> */
        preg_match_all('/<meta\s+[^>]*name\s*=\s*[\'"][^>]+>/siU', $head, $m);
        foreach ($m[0] as $row) {
            preg_match('/name\s*=\s*[\'"](.+)[\'"]/siU', $row, $key);
            preg_match('/content\s*=\s*[\'"](.+)[\'"]/siU', $row, $val);
            if (!empty($key[1]) && !empty($val[1])) {
                $meta[$key[1]] = $val[1];
            }
        }

        /* get <meta http-equiv=refresh...> */
        preg_match('/<meta[^>]+http-equiv\s*=\s*[\'"]?refresh[\'"]?[^>]+content\s*=\s*[\'"](.+)[\'"][^>]*>/siU', $head, $m);
        if (!empty($m[1])) {
            /* drop HTML-encoded single quotes around the target URL */
            $meta['http-equiv']['refresh'] = preg_replace('/&#0?39;/', '', $m[1]);
        }

        return $meta;
    }

    /**
     * Convert form to associative array
     *
     * Collects the default values of <select> and <input> elements of the
     * form with the given name or id. Buttons and file inputs are skipped;
     * radio buttons and checkboxes are included only when checked. Also
     * records whether the form asks for multipart/form-data so the next
     * POST is encoded accordingly.
     *
     * @param string $name_or_id value of the form's name or id attribute
     * @param string $action     unused; received by value, so the form's
     *                           action cannot be handed back to the caller
     * @param string $str        HTML to search; defaults to the last fetched page
     * @return array element name (or id) => default value
     */
    public function parseForm($name_or_id, $action = '', $str = '')
    {
        if (empty($str) && empty($this->_body[0])) {
            return array();
        }
        $body = empty($str) ? $this->_body[0] : $str;
        $res  = array();

        /* extract the form; the name is data, not a pattern */
        $id = preg_quote($name_or_id, '/');
        $re = '(<form[^>]+(id|name)\s*=\s*(?(?=[\'"])[\'"]' . $id . '[\'"]|\b' . $id . '\b)[^>]*>.+<\/form>)';
        if (!preg_match("/$re/siU", $body, $form)) {
            return array();
        }

        /* check if enctype=multipart/form-data */
        $this->_multipart = (bool) preg_match('/<form[^>]+enctype[^>]+multipart\/form-data[^>]*>/siU', $form[1]);

        /* select all <select..> with default values */
        $re = '<select[^>]+name\s*=\s*(?(?=[\'"])[\'"]([^>]+)[\'"]|\b([^>]+)\b)[^>]*>'
            . '.+value\s*=\s*(?(?=[\'"])[\'"]([^>]+)[\'"]|\b([^>]+)\b)[^>]+\bselected\b'
            . '.+<\/select>';
        preg_match_all("/$re/siU", $form[1], $a);
        foreach ($a[1] as $num => $key) {
            $val = $a[3][$num];
            if ($val == '') {
                $val = $a[4][$num];
            }
            if ($key == '') {
                $key = $a[2][$num];
            }
            $res[$key] = html_entity_decode($val);
        }

        /* get all <input...> */
        preg_match_all('/<input([^>]+)\/?>/siU', $form[1], $a);

        /* convert to associative array */
        foreach ($a[1] as $b) {
            preg_match_all('/([a-z]+)\s*=\s*(?(?=[\'"])[\'"]([^"]+)[\'"]|\b(.+)\b)/siU', trim($b), $c);

            /* attribute names are case-insensitive in HTML */
            $element = array();
            foreach ($c[1] as $num => $key) {
                $val = $c[2][$num];
                if ($val == '') {
                    $val = $c[3][$num];
                }
                $element[strtolower($key)] = $val;
            }

            $type   = isset($element['type']) ? strtolower($element['type']) : '';
            $name   = isset($element['name']) ? $element['name'] : '';
            $idattr = isset($element['id']) ? $element['id'] : '';
            $value  = isset($element['value']) ? $element['value'] : '';

            /* only radio or checkbox with default values */
            if (($type == 'radio' || $type == 'checkbox') && !preg_match('/\s+\bchecked\b/i', $b)) {
                continue;
            }

            /* remove buttons and file */
            if ($type == 'file' || $type == 'submit' || $type == 'reset' || $type == 'button') {
                continue;
            }

            /* remove unnamed elements */
            if ($name == '' && $idattr == '') {
                continue;
            }

            /* cool */
            $res[$name == '' ? $idattr : $name] = html_entity_decode($value);
        }

        return $res;
    }

    /**
     * Get mime type for a file
     *
     * @param string $filename file name or path
     * @return string mime type chosen by file extension,
     *                'application/octet-stream' when the extension is unknown or missing
     */
    protected function getMimeType($filename)
    {
        /* list of mime type. add more rows to suit your need */
        $mimetypes = array(
            'jpg'  => 'image/jpeg',
            'jpe'  => 'image/jpeg',
            'jpeg' => 'image/jpeg',
            'gif'  => 'image/gif',
            'png'  => 'image/png',
            'tiff' => 'image/tiff',
            'html' => 'text/html',
            'txt'  => 'text/plain',
            'pdf'  => 'application/pdf',
            'zip'  => 'application/zip'
        );

        /* get file extension */
        $ext = strtolower((string) pathinfo($filename, PATHINFO_EXTENSION));

        /* known type, or the default mime type */
        return isset($mimetypes[$ext]) ? $mimetypes[$ext] : 'application/octet-stream';
    }

    /**
     * Log HTTP request/response
     *
     * Appends the header to headers.txt in the debug directory. POST bodies
     * are logged with file contents masked; text/html response bodies are
     * written to numbered .html files next to the log.
     *
     * @param string      $url  requested URL
     * @param string      $head request or response header block
     * @param string|null $body request or response body
     * @return void
     */
    protected function logHttpStream($url, $head, $body)
    {
        /* open log file */
        if (($fp = @fopen("$this->_debugdir/headers.txt", 'a')) == false) {
            return;
        }

        /* get method */
        $m = substr($head, 0, 4);

        /* append the requested URL for HEAD, GET and POST */
        if ($m == 'HEAD' || $m == 'GET ' || $m == 'POST') {
            $head = str_repeat('-', 90) . "\r\n$url\r\n\r\n" . trim($head);
        }

        /* header */
        @fputs($fp, trim($head) . "\r\n\r\n");

        /* request body */
        if ($m == 'POST' && strpos($head, 'Content-Length: ') !== false) {
            /* skip binary contents */
            $find = 'Content-Type: \s*([^\s]+)\r\n\r\n(.+)\r\n';
            $repl = "Content-Type: $1\r\n\r\n <... File contents ...>\r\n";
            $body = preg_replace('/' . $find . '/siU', $repl, $body);
            @fputs($fp, "$body\r\n\r\n");
        }

        /* response body */
        if (substr($head, 0, 7) == 'HTTP/1.' && strpos($head, 'text/html') !== false && !empty($body)) {
            $tmp = "$this->_debugdir/" . $this->_debugnum++ . '.html';
            @file_put_contents($tmp, $body);
            @fputs($fp, "<... See page contents in $tmp ...>\r\n\r\n");
        }

        @fclose($fp);
    }

    /**
     * Turn logging on or off
     *
     * When turned on, the debug directory is created if needed and every
     * regular file in it is deleted.
     *
     * @param bool $bool true to enable logging
     * @return void
     */
    public function setDebug($bool)
    {
        $this->_log = $bool;
        if (!$this->_log) {
            return;
        }

        /* create directory; a directory needs the search (x) bit to be usable */
        if (!is_dir($this->_debugdir)) {
            if (!@mkdir($this->_debugdir, 0755)) {
                return;
            }
            @chmod($this->_debugdir, 0755);
        }

        /* empty debug directory */
        $items = @scandir($this->_debugdir);
        if ($items === false) {
            return;
        }
        foreach ($items as $item) {
            if ($item == '.' || $item == '..') {
                continue;
            }
            if (is_file("$this->_debugdir/$item")) {
                @unlink("$this->_debugdir/$item");
            }
        }
    }

    /**
     * Set proxy
     *
     * @param string     $host proxy host name
     * @param int|string $port proxy port
     * @param string     $user user name for proxy authentication
     * @param string     $pass password for proxy authentication
     * @return void
     */
    public function setProxy($host, $port, $user = '', $pass = '')
    {
        $this->_proxy_host = $host;
        $this->_proxy_port = $port;
        $this->_proxy_user = $user;
        $this->_proxy_pass = $pass;
        $this->_useproxy   = true;
    }

    /**
     * Set delay between requests
     *
     * @param int|string $sec whole seconds, at least 1; anything else resets the delay to 1
     * @return void
     */
    public function setInterval($sec)
    {
        if (!preg_match('/^\d+$/', (string) $sec) || $sec <= 0) {
            $this->_delay = 1;
        } else {
            $this->_delay = (int) $sec;
        }
    }

    /**
     * Assign a name for this HTTP client
     *
     * @param string $ua User-Agent header value; empty to send none
     * @return void
     */
    public function setUserAgent($ua)
    {
        $this->_user_agent = $ua;
    }
}
/* end of file */