PHP Classes

File: phpWebHacks.php

Recommend this page to a friend!
  Classes of Nashruddin Amin   Hacker's HTTP Client   phpWebHacks.php   Download  
File: phpWebHacks.php
Role: Class source
Content type: text/plain
Description: HTTP client
Class: Hacker's HTTP Client
Automate retrieval and processing of Web pages
Author: By
Last change: change listing priority to 1
Date: 15 years ago
Size: 19,921 bytes
 

Contents

Class file image Download
<?php /** * phpWebHacks.php 1.5 * This class is a powerful tool for HTTP scripting with PHP. * It simulates a web browser, only that you use it with lines of code * rather than mouse and keyboard. * * See the documentation at http://php-http.com/documentation * See the examples at http://php-http.com/examples * * Author Nashruddin Amin - me@nashruddin.com * License GPL * Website http://php-http.com */ class phpWebHacks { private $_user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9) Gecko/2008052906 Firefox/3.0'; private $_boundary = '----PhPWebhACKs-RoCKs--'; private $_useproxy = false; private $_proxy_host = ''; private $_proxy_port = ''; private $_proxy_user = ''; private $_proxy_pass = ''; private $_usegzip = false; private $_log = false; private $_debugdir = '.log'; private $_debugnum = 1; private $_delay = 1; private $_body = array(); private $_cookies = array(); private $_addressbar = ''; private $_multipart = false; private $_timestart = 0; private $_bytes = 0; /** * Constructor */ public function __construct() { $this->setDebug(true); /* check if zlib is available */ if (function_exists('gzopen')) { $this->_usegzip = true; } /* start time */ $this->_timestart = microtime(true); } /** * Destructor */ public function __destruct() { /* remove temporary file for gzip encoding */ if (file_exists('tmp.gz')) { unlink('tmp.gz'); } /* get elapsed time and transferred bytes */ $time = sprintf("%02.1f", microtime(true) - $this->_timestart); $bytes = sprintf("%d", ceil($this->_bytes / 1024)); /* log */ if ($this->_log) { $fp = fopen("$this->_debugdir/headers.txt", 'a'); fputs($fp, "------ Transferred " . $bytes . "kb in $time sec ------\r\n"); fclose($fp); } } /** * HEAD */ public function head($url) { return $this->fetch($url, 'HEAD'); } /** * GET */ public function get($url) { return $this->fetch($url, 'GET'); } /** * POST */ public function post($url, $form = array(), $files = array()) { return $this->fetch($url, 'POST', 10, $form, $files); } /** * Make HTTP request */ protected function fetch($url, $method, $maxredir = 10, $form = array(), $files = array()) { /* convert to absolute if relative URL */ $url = $this->getAbsUrl($url, $this->_addressbar); /* only http or https */ if (substr($url, 0, 4) != 'http') return ''; /* cache URL */ $this->_addressbar = $url; /* build request */ $reqbody = $this->getReqBody($form, $files); $reqhead = $this->getReqHead($url, $method, strlen($reqbody), empty($files) ? false : true); /* log request */ if ($this->_log) { $this->logHttpStream($url, $reqhead, $reqbody); } /* parse URL and convert to local variables: $scheme, $host, $path */ $parts = parse_url($url); if (!$parts) { die("Invalid URL!\n"); } else { foreach($parts as $key=>$val) $$key = $val; } /* open connection */ if ($this->_useproxy) { $fp = @fsockopen($this->_proxy_host, $this->_proxy_port); } else { $fp = @fsockopen(($scheme=='https' ? "ssl://$host" : $host), $scheme == 'https' ? 443 : 80); } /* always check */ if (!$fp) { die("Cannot connect to $host!\n"); } /* send request & read response */ @fputs($fp, $reqhead.$reqbody); for($res=''; !feof($fp); $res.=@fgets($fp, 4096)) {} fclose($fp); /* set delay between requests. behave! */ sleep($this->_delay); /* transferred bytes */ $this->_bytes += (strlen($reqhead)+ strlen($reqbody)+ strlen($res)); /* get response header & body */ list($reshead, $resbody) = explode("\r\n\r\n", $res, 2); /* convert header to associative array */ $head = $this->parseHead($reshead); /* return immediately if HEAD */ if ($method == 'HEAD') { if ($this->_log) $this->logHttpStream($url, $reshead, null); return $head; } /* cookies */ if (!empty($head['Set-Cookie'])) { $this->saveCookies($head['Set-Cookie'], $url); } /* referer */ if ($head['Status']['Code'] == 200) { $this->_referer = $url; } /* transfer-encoding: chunked */ if ($head['Transfer-Encoding'] == 'chunked') { $body = $this->joinChunks($resbody); } else { $body = $resbody; } /* content-encoding: gzip */ if ($head['Content-Encoding'] == 'gzip') { @file_put_contents('tmp.gz', $body); $fp = @gzopen('tmp.gz', 'r'); for($body = ''; !@gzeof($fp); $body.=@gzgets($fp, 4096)) {} @gzclose($fp); } /* log response */ if ($this->_log) { $this->logHttpStream($url, $reshead, $body); } /* cache body */ array_unshift($this->_body, $body); /* redirects: 302 */ if (isset($head['Location']) && $maxredir > 0) { $this->fetch($this->getAbsUrl($head['Location'], $url), 'GET', $maxredir--); } /* parse meta tags */ $meta = $this->parseMetaTags($body); /* redirects: <meta http-equiv=refresh...> */ if (isset($meta['http-equiv']['refresh']) && $maxredir > 0) { list($delay, $loc) = explode(';', $meta['http-equiv']['refresh'], 2); $loc = substr(trim($loc), 4); if (!empty($loc) && $loc != $url) $this->fetch($this->getAbsUrl($loc, $url), 'GET', $maxredir--); } /* get body and clear cache */ $body = $this->_body[0]; for($i = 1; $i < count($this->_body); $i++) { unset($this->_body[$i]); } return $body; } /** * Build request header */ protected function getReqHead($url, $method, $bodylen = 0, $sendfile = true) { /* parse URL elements to local variables: $scheme, $host, $path, $query, $user, $pass */ $parts = parse_url($url); foreach($parts as $key=>$val) $$key = $val; /* setup path */ $path = empty($path) ? '/' : $path .(empty($query) ? '' : "?$query"); /* request header */ if ($this->_useproxy) { $head = "$method $url HTTP/1.1\r\nHost: $this->_proxy_host\r\n"; } else { $head = "$method $path HTTP/1.1\r\nHost: $host\r\n"; } /* cookies */ $head .= $this->getCookies($url); /* content-type */ if ($method == 'POST' && ($sendfile || $this->_multipart)) { $head .= "Content-Type: multipart/form-data; boundary=$this->_boundary\r\n"; } elseif ($method == 'POST') { $head .= "Content-Type: application/x-www-form-urlencoded\r\n"; } /* set the content length if POST */ if ($method == 'POST') { $head .= "Content-Length: $bodylen\r\n"; } /* basic authentication */ if (!$this->_useproxy && !empty($user) && !empty($pass)) { $head .= "Authorization: Basic ". base64_encode("$user:$pass")."\r\n"; } /* basic authentication for proxy */ if ($this->_useproxy && !empty($this->_proxy_user) && !empty($this->_proxy_pass)) { $head .= "Authorization: Basic ". base64_encode("$this->_proxy_user:$this->_proxy_pass")."\r\n"; } /* gzip */ if ($this->_usegzip) { $head .= "Accept-Encoding: gzip\r\n"; } /* make it like real browsers */ if (!empty($this->_user_agent)) { $head .= "User-Agent: $this->_user_agent\r\n"; } if (!empty($this->_referer)) { $head .= "Referer: $this->_referer\r\n"; } /* no pipelining yet */ $head .= "Connection: Close\r\n\r\n"; /* request header is ready */ return $head; } /** * Build request body */ protected function getReqBody($form = array(), $files = array()) { /* check for parameters */ if (empty($form) && empty($files)) return ''; $body = ''; $tmp = array(); /* only form available: x-www-urlencoded */ if (!empty($form) && empty($files) && !$this->_multipart) { foreach($form as $key=>$val) $tmp[] = $key .'='. urlencode($val); return implode('&', $tmp); } /* form */ foreach($form as $key=>$val) { $body .= "--$this->_boundary\r\nContent-Disposition: form-data; name=\"" . $key ."\"\r\n\r\n" . $val ."\r\n"; } /* files */ foreach($files as $key=>$val) { if (!file_exists($val)) continue; $body .= "--$this->_boundary\r\n" . "Content-Disposition: form-data; name=\"" . $key . "\"; filename=\"" . basename($val) . "\"\r\n" . "Content-Type: " . $this->getMimeType($val) . "\r\n\r\n" . file_get_contents($val) . "\r\n"; } /* request body is ready! */ return $body."--$this->_boundary--"; } /** * convert response header to associative array */ protected function parseHead($str) { $lines = explode("\r\n", $str); list($ver, $code, $msg) = explode(' ', array_shift($lines), 3); $stat = array('Version' => $ver, 'Code' => $code, 'Message' => $msg); $head = array('Status' => $stat); foreach($lines as $line) { list($key, $val) = explode(':', $line, 2); if ($key == 'Set-Cookie') { $head['Set-Cookie'][] = trim($val); } else { $head[$key] = trim($val); } } return $head; } /** * Read chunked pages */ protected function joinChunks($str) { $CRLF = "\r\n"; for($tmp = $str, $res = ''; !empty($tmp); $tmp = trim($tmp)) { if (($pos = strpos($tmp, $CRLF)) === false) return $str; $len = hexdec(substr($tmp, 0, $pos)); $res.= substr($tmp, $pos + strlen($CRLF), $len); $tmp = substr($tmp, $pos + strlen($CRLF) + $len); } return $res; } /** * Save cookies from server */ protected function saveCookies($set_cookies, $url) { foreach($set_cookies as $str) { $parts = explode(';', $str); /* extract cookie parts to local variables: $name, $value, $domain, $path, $expires, $secure, $httponly */ foreach($parts as $part) { list($key, $val) = explode('=', trim($part), 2); $k = strtolower($key); if ($k == 'secure' || $k == 'httponly') { $$k = true; } elseif ($k == 'domain' || $k == 'path' || $k == 'expires') { $$k = $val; } else { $name = $key; $value = $val; } } /* cookie's domain */ if (empty($domain)) { $domain = parse_url($url, PHP_URL_HOST); } /* cookie's path */ if (empty($path)) { $path = parse_url($url, PHP_URL_PATH); $path = preg_replace('#/[^/]*$#', '', $path); $path = empty($path) ? '/' : $path; } /* cookie's expire time */ if (!empty($expires)) { $expires = strtotime($expires); } /* setup cookie ID, a simple trick to add/update existing cookie and cleanup local variables later */ $id = md5("$domain;$path;$name"); /* add/update cookie */ $this->_cookies[$id] = array( 'domain' => substr_count($domain, '.') == 1 ? ".$domain" : $domain, 'path' => $path, 'expires' => $expires, 'name' => $name, 'value' => $value, 'secure' => $secure, 'httponly' => $httponly ); /* cleanup local variables */ foreach($this->_cookies[$id] as $key=>$val) unset($$key); } return true; } /** * Get cookies for URL */ protected function getCookies($url) { $tmp = array(); $res = array(); /* remove expired cookies first */ foreach($this->_cookies as $id=>$cookie) { if (empty($cookie['expires']) || $cookie['expires'] >= time()) { $tmp[$id] = $cookie; } } /* cookies ready */ $this->_cookies = $tmp; /* parse URL to local variables: $scheme, $host, $path, $query */ $parts = parse_url($url); foreach($parts as $key=>$val) $$key = $val; if (empty($path)) $path = '/'; /* get all cookies for this domain and path */ foreach($this->_cookies as $cookie) { $d = substr($host, -1 * strlen($cookie['domain'])); $p = substr($path, 0, strlen($cookie['path'])); if (($d == $cookie['domain'] || ".$d" == $cookie['domain']) && $p == $cookie['path']) { if ($cookie['secure'] == true && $scheme == 'http') { continue; } $res[] = $cookie['name'].'='.$cookie['value']; } } /* return the string for HTTP header */ return (empty($res) ? '' : 'Cookie: '.implode('; ', $res)."\r\n"); } /** * Convert relative URL to absolute URL */ protected function getAbsUrl($loc, $parent) { /* parameters is required */ if (empty($loc) && empty($parent)) return; $loc = str_replace('&amp;', '&', $loc); /* return if URL is abolute */ if (parse_url($loc, PHP_URL_SCHEME) != '') return $loc; /* handle anchors and query's part */ $c = substr($loc, 0, 1); if ($c == '#' || $c == '&') return "$parent$loc"; /* handle query string */ if ($c == '?') { $pos = strpos($parent, '?'); if ($pos !== false) $parent = substr($parent, 0, $pos); return "$parent$loc"; } /* parse URL and convert to local variables: $scheme, $host, $path */ $parts = parse_url($parent); foreach ($parts as $key=>$val) $$key = $val; /* remove non-directory part from path */ $path = preg_replace('#/[^/]*$#', '', $path); /* set path to '/' if empty */ $path = preg_match('#^/#', $loc) ? '/' : $path; /* dirty absolute URL */ $abs = "$host$path/$loc"; /* replace '//', '/./', '/foo/../' with '/' */ while($abs = preg_replace(array('#(/\.?/)#', '#/(?!\.\.)[^/]+/\.\./#'), '/', $abs, -1, $count)) if (!$count) break; /* absolute URL */ return "$scheme://$abs"; } /** * Convert meta tags to associative array */ protected function parseMetaTags($html) { /* extract to </head> */ if (($pos = strpos(strtolower($html), '</head>')) === false) { return array(); } else { $head = substr($html, 0, $pos); } /* get page's title */ preg_match("/<title>(.+)<\/title>/siU", $head, $m); $meta = array('title' => $m[1]); /* get all <meta...> */ preg_match_all('/<meta\s+[^>]*name\s*=\s*[\'"][^>]+>/siU', $head, $m); foreach($m[0] as $row) { preg_match('/name\s*=\s*[\'"](.+)[\'"]/siU', $row, $key); preg_match('/content\s*=\s *[\'"](.+)[\'"]/siU', $row, $val); if (!empty($key[1]) && !empty($val[1])) $meta[$key[1]] = $val[1]; } /* get <meta http-equiv=refresh...> */ preg_match('/<meta[^>]+http-equiv\s*=\s*[\'"]?refresh[\'"]?[^>]+content\s*=\s*[\'"](.+)[\'"][^>]*>/siU', $head, $m); if (!empty($m[1])) { $meta['http-equiv']['refresh'] = preg_replace('/&#0?39;/', '', $m[1]); } return $meta; } /** * Convert form to associative array */ public function parseForm($name_or_id, $action = '', $str = '') { if (empty($str) && empty($this->_body[0])) return array(); $body = empty($str) ? $this->_body[0] : $str; /* extract the form */ $re = '(<form[^>]+(id|name)\s*=\s*(?(?=[\'"])[\'"]'.$name_or_id.'[\'"]|\b'.$name_or_id.'\b)[^>]*>.+<\/form>)'; if (!preg_match("/$re/siU", $body, $form)) return array(); /* check if enctype=multipart/form-data */ if (preg_match('/<form[^>]+enctype[^>]+multipart\/form-data[^>]*>/siU', $form[1], $a)) $this->_multipart = true; else $this->_multipart = false; /* get form's action */ preg_match('/<form[^>]+action\s*=\s*(?(?=[\'"])[\'"]([^\'"]+)[\'"]|([^>\s]+))[^>]*>/si', $form[1], $a); $action = empty($a[1]) ? html_entity_decode($a[2]) : html_entity_decode($a[1]); /* select all <select..> with default values */ $re = '<select[^>]+name\s*=\s*(?(?=[\'"])[\'"]([^>]+)[\'"]|\b([^>]+)\b)[^>]*>' . '.+value\s*=\s*(?(?=[\'"])[\'"]([^>]+)[\'"]|\b([^>]+)\b)[^>]+\bselected\b' . '.+<\/select>'; preg_match_all("/$re/siU", $form[1], $a); foreach($a[1] as $num=>$key) { $val = $a[3][$num]; if ($val == '') $val = $a[4][$num]; if ($key == '') $key = $a[2][$num]; $res[$key] = html_entity_decode($val); } /* get all <input...> */ preg_match_all('/<input([^>]+)\/?>/siU', $form[1], $a); /* convert to associative array */ foreach($a[1] as $b) { preg_match_all('/([a-z]+)\s*=\s*(?(?=[\'"])[\'"]([^"]+)[\'"]|\b(.+)\b)/siU', trim($b), $c); $element = array(); foreach($c[1] as $num=>$key) { $val = $c[2][$num]; if ($val == '') $val = $c[3][$num]; $element[$key] = $val; } $type = strtolower($element['type']); /* only radio or checkbox with default values */ if ($type == 'radio' || $type == 'checkbox') if (!preg_match('/\s+\bchecked\b/', $b)) continue; /* remove buttons and file */ if ($type == 'file' || $type == 'submit' || $type == 'reset' || $type == 'button') continue; /* remove unnamed elements */ if ($element['name'] == '' && $element['id'] == '') continue; /* cool */ $key = $element['name'] == '' ? $element['id'] : $element['name']; $res[$key] = html_entity_decode($element['value']); } return $res; } /** * Get mime type for a file */ protected function getMimeType($filename) { /* list of mime type. add more rows to suit your need */ $mimetypes = array( 'jpg' => 'image/jpeg', 'jpe' => 'image/jpeg', 'jpeg' => 'image/jpeg', 'gif' => 'image/gif', 'png' => 'image/png', 'tiff' => 'image/tiff', 'html' => 'text/html', 'txt' => 'text/plain', 'pdf' => 'application/pdf', 'zip' => 'application/zip' ); /* get file extension */ preg_match('#\.([^\.]+)$#', strtolower($filename), $e); /* get mime type */ foreach($mimetypes as $ext=>$mime) if ($e[1] == $ext) return $mime; /* this is the default mime type */ return 'application/octet-stream'; } /** * Log HTTP request/response */ protected function logHttpStream($url, $head, $body) { /* open log file */ if (($fp = @fopen("$this->_debugdir/headers.txt", 'a')) == false) return; /* get method */ $m = substr($head, 0, 4); /* append the requested URL for HEAD, GET and POST */ if ($m == 'HEAD' || $m == 'GET ' || $m == 'POST') $head = str_repeat('-', 90) . "\r\n$url\r\n\r\n" . trim($head); /* header */ @fputs($fp, trim($head)."\r\n\r\n"); /* request body */ if ($m == 'POST' && strpos($head, 'Content-Length: ') !== false) { /* skip binary contents */ $find = 'Content-Type: \s*([^\s]+)\r\n\r\n(.+)\r\n'; $repl = "Content-Type: $1\r\n\r\n <... File contents ...>\r\n"; $body = preg_replace('/'.$find .'/siU', $repl, $body); @fputs($fp, "$body\r\n\r\n"); } /* response body */ if (substr($head, 0, 7) == 'HTTP/1.' && strpos($head, 'text/html') !== false && !empty($body)) { $tmp = "$this->_debugdir/" . $this->_debugnum++ . '.html'; @file_put_contents($tmp, $body); @fputs($fp, "<... See page contents in $tmp ...>\r\n\r\n"); } @fclose($fp); } public function setDebug($bool) { $this->_log = $bool; if (!$this->_log) return; /* create directory */ if (!is_dir($this->_debugdir)) { mkdir($this->_debugdir); chmod($this->_debugdir, 0644); } /* empty debug directory */ $items = scandir($this->_debugdir); foreach($items as $item) { if ($item == '.' || $item == '..') continue; unlink("$this->_debugdir/$item"); } } /** * Set proxy */ public function setProxy($host, $port, $user = '', $pass = '') { $this->_proxy_host = $host; $this->_proxy_port = $port; $this->_proxy_user = $user; $this->_proxy_pass = $pass; $this->_useproxy = true; } /** * Set delay between requests */ public function setInterval($sec) { if (!preg_match('/^\d+$/', $sec) || $sec <= 0) { $this->_delay = 1; } else { $this->_delay = $sec; } } /** * Assign a name for this HTTP client */ public function setUserAgent($ua) { $this->_user_agent = $ua; } }