<?php
/** * UrlTool Class: * A class to parse, validate, encode, and check url status. * * @version 1.1 * @author Hossamzee (hossam_zee@yahoo.com). * @date 7 Aug 2012. */
class UrlTool
{
    /**
     * Parses a url and gets the components of it.
     *
     * The url is split into scheme, authority (user-info), hostname, port,
     * request, path, query string and fragment. The host part ends at the
     * first '/', '?' or '#', so "http://example.com?q=1" is parsed as well.
     * When the host ends at '/', the slash is not part of "request"; when it
     * ends at '?' or '#', "request" starts with that delimiter.
     *
     * @param string $url   Url to be parsed.
     * @param string $error If there is an error, it then is filled in this variable (passed-by-reference).
     * @return mixed Array of components of the url if it is validated, or false.
     */
    public /* mixed */ function parseUrl($url, &$error = "")
    {
        /* Keep the caller's value untouched in the result, but work on a string. */
        $components = array();
        $components["url"] = $url;
        $urlString = (string) $url;

        /* Get the scheme of the url (only recognised when followed by "://"). */
        $scheme = null;
        $hostRequest = $urlString;
        if (preg_match('/^([A-Z][A-Z0-9+\-.]+):\/\//i', $urlString, $schemeMatch) === 1) {
            $scheme = $schemeMatch[1];
            $hostRequest = (string) substr($urlString, strlen($schemeMatch[0]));
        }

        /* Split host and request apart at the first '/', '?' or '#'. */
        $hostLength = strcspn($hostRequest, '/?#');
        $host = (string) substr($hostRequest, 0, $hostLength);
        $request = null;
        if ($hostLength < strlen($hostRequest)) {
            /* A leading '/' is dropped; a leading '?' or '#' is kept so the
               query string / fragment can still be extracted below. */
            $requestStart = ($hostRequest[$hostLength] === '/') ? $hostLength + 1 : $hostLength;
            $request = (string) substr($hostRequest, $requestStart);
        }

        /* Get authority (user-info) from host. */
        $authority = null;
        $atPos = strpos($host, '@');
        if ($atPos !== false) {
            $authority = (string) substr($host, 0, $atPos);
            $host = (string) substr($host, $atPos + 1);
        }

        /* Split hostname and port. */
        $port = null;
        if ($host !== '' && $host[0] === '[') {
            /* Bracketed IP literal: the port, if any, follows "]:". */
            $ipversion = 6;
            $closingPos = strpos($host, ']:');
            if ($closingPos !== false) {
                $hostname = (string) substr($host, 0, $closingPos + 1);
                $port = (string) substr($host, $closingPos + 2);
            } else {
                $hostname = $host;
            }
        } else {
            /* IPv4 address or registered name: the port follows the first ':'. */
            $ipversion = 4;
            $colonPos = strpos($host, ':');
            if ($colonPos !== false) {
                $hostname = (string) substr($host, 0, $colonPos);
                $port = (string) substr($host, $colonPos + 1);
            } else {
                $hostname = $host;
            }
        }

        /* Strip one trailing dot from hostname. */
        if (substr($hostname, -1) === '.') {
            $hostname = (string) substr($hostname, 0, -1);
        }

        /* Derive path, query string and fragment from the request (if any). */
        $path = $request;
        $querystring = null;
        $fragment = null;
        if ($path !== null) {
            $hashPos = strpos($path, '#');
            if ($hashPos !== false) {
                $fragment = (string) substr($path, $hashPos + 1);
                $path = (string) substr($path, 0, $hashPos);
            }

            $questionMarkPos = strpos($path, '?');
            if ($questionMarkPos !== false) {
                $querystring = (string) substr($path, $questionMarkPos + 1);
                $path = (string) substr($path, 0, $questionMarkPos);
            }
        }

        /* Push results to components. */
        $components["scheme"] = $scheme;
        $components["ipversion"] = $ipversion;
        $components["authority"] = $authority;
        $components["port"] = $port;
        $components["hostname"] = $hostname;
        $components["request"] = $request;
        $components["path"] = $path;
        $components["querystring"] = $querystring;
        $components["fragment"] = $fragment;

        /* Validate the url components (may adjust "scheme" and "ipversion"). */
        if ($this->validateUrlComponents($components, $error) === false) {
            return false;
        }

        return $components;
    }

    /**
     * Validates url components.
     *
     * Side effects on $components: an empty scheme is replaced by "http";
     * "ipversion" becomes "future" for an IPvFuture literal and "reg-name"
     * for a dotted hostname that is not an IPv4 address.
     *
     * @param array  $components Components of the url (passed-by-reference).
     * @param string $error      If there is an error, it then is filled in this variable (passed-by-reference).
     * @return bool True if the url components are valid, false otherwise.
     */
    private /* bool */ function validateUrlComponents(&$components = array(), &$error = "")
    {
        /* Validate the scheme of the url, or fall back to "http". */
        if ($components["scheme"] !== null && $components["scheme"] !== "") {
            if (preg_match('/^[A-Z][A-Z0-9+\-.]+\z/i', $components["scheme"]) !== 1) {
                $error = "The scheme did not match the pattern ({$components["scheme"]}).";
                return false;
            }
        } else {
            $components["scheme"] = "http";
        }

        /* Validate the port if there is any: decimal digits only. */
        if ($components["port"] !== null && $components["port"] !== "") {
            if (preg_match('/^[0-9]+\z/', (string) $components["port"]) !== 1) {
                $error = "The port is not a number ({$components["port"]}).";
                return false;
            }
        }

        /* Validate the hostname (mandatory). */
        $hostname = (string) $components["hostname"];
        if ($hostname === "") {
            $error = "The hostname is empty (mandatory variable).";
            return false;
        }

        /* Validate the authority (user-info): report the first character that
           is neither a percent-encoded byte nor in the allowed set. */
        $authority = $components["authority"];
        if ($authority !== null && $authority !== "") {
            preg_match('/^(?:%[0-9A-F]{2}|[0-9A-Z|\'~!$&*()_+=;:.,-])*/i', $authority, $authorityMatches);
            $validLength = strlen($authorityMatches[0]);
            if ($validLength < strlen($authority)) {
                $wrongSymbol = $authority[$validLength];
                $error = "Wrong symbol used in authority ($wrongSymbol).";
                return false;
            }
        }

        /* Bracketed IP literal: IPvFuture or IPv6. */
        if ($components["ipversion"] == 6) {
            $hostnameWithoutSquareBrackets = (string) substr($hostname, 1, -1);

            /* The literal must be closed by ']'. */
            if (substr($hostname, -1) !== ']') {
                $error = "The hostname is not valid as an ipv6 ({$hostnameWithoutSquareBrackets}).";
                return false;
            }

            /* Validate the syntax of ip version future, e.g. [v1.fe80::a]. */
            if (preg_match('/^v[0-9A-F]+\.[A-Z0-9\-._~!$&\'()*+,;=:]+\z/i', $hostnameWithoutSquareBrackets) === 1) {
                $components["ipversion"] = "future";
                return true;
            }

            /* Validate the syntax of ipv6. */
            /* Source: http://crisp.tweakblogs.net/blog/2031 */
            $ipv6Pattern = '/^(?:'
                . '[a-f0-9]{1,4}(?::[a-f0-9]{1,4})*'
                . '|[a-f0-9]{1,4}(?::[a-f0-9]{1,4})*::(?:[a-f0-9]{1,4}(?::[a-f0-9]{1,4})*)?'
                . '|::(?:[a-f0-9]{1,4}(?::[a-f0-9]{1,4})*)?'
                . ')(?::\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})?\z/i';
            if (preg_match($ipv6Pattern, $hostnameWithoutSquareBrackets) === 1) {
                /* If the ipv6 is valid, e.g. http://[fe80:0:0:0:202:b3ff:fe1e:8329]. */
                return true;
            }

            $error = "The hostname is not valid as an ipv6 ({$hostnameWithoutSquareBrackets}).";
            return false;
        }

        /* IPv4 address or registered name: a dot is required. */
        if (strpos($hostname, '.') === false) {
            $error = "The hostname did not contain a dot ({$hostname}).";
            return false;
        }

        /* Reject empty labels, e.g. ".com" or "a..b". */
        if (in_array("", explode(".", $hostname), true)) {
            $error = "The hostname does not look like hostname.";
            return false;
        }

        /* Validate that the hostname is a dotted-quad ip. */
        $ipv4Pattern = '/^\b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}'
            . '(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\b\z/';
        if (preg_match($ipv4Pattern, $hostname) === 1) {
            return true;
        }

        /* It might be a regular hostname: no delimiters or whitespace allowed. */
        if (preg_match('/[:\/?#\[\]@\s]/', $hostname) === 1) {
            $error = "The hostname is not valid.";
            return false;
        }

        $components["ipversion"] = "reg-name";
        return true;
    }

    /**
     * Checks if the url exists or not (not-in-use).
     *
     * Sends a single HEAD request (no redirects followed) using a context
     * local to this call, so the process-wide default stream context is not
     * modified. Any response status other than 404 counts as "exists".
     *
     * @param string $url          Url to be checked.
     * @param float  $responseTime Time taken to response, in seconds (passed-by-reference).
     * @return bool True if a response was received and its status is not 404, false otherwise.
     */
    public /* bool */ function checkUrl($url, &$responseTime)
    {
        /* Set the request method to be head, for this request only. */
        $context = stream_context_create(array(
            "http" => array("method" => "HEAD", "max_redirects" => 1),
        ));

        /* Send a head request and measure how long it takes. */
        $startTime = microtime(true);
        $headers = get_headers($url, 0, $context);
        $responseTime = microtime(true) - $startTime;

        /* No response at all: the url cannot be confirmed to exist. */
        if ($headers === false || !isset($headers[0])) {
            return false;
        }

        /* Get HTTP response code from the status line. */
        if (preg_match('/HTTP\/\d(?:\.\d)? (\d{3})/i', $headers[0], $responseArray) !== 1) {
            return false;
        }

        /* Return true, if the url is not 404, false otherwise. */
        return ((int) $responseArray[1] !== 404);
    }

    /**
     * Encodes a normal domain name (Unicode/UTF-8) to Punycode (to-do).
     *
     * Not implemented yet: always returns an empty string.
     *
     * @param string $domain Domain name (UTF-8).
     * @return string Punycode of the domain (currently always "").
     */
    public /* string */ function domainToPunycode($domain)
    {
        return "";
    }

    /**
     * Normalize URL to be in this format: scheme://[authority@]hostname[:port]/[request]
     *
     * @param array $urlComponents URL components, as returned by parseUrl().
     * @return string Normalized URL.
     */
    public function normalizeUrl($urlComponents)
    {
        /* Scheme. */
        $normalizedUrl = $urlComponents["scheme"] . "://";

        /* Authority, only when present and non-empty. */
        if (isset($urlComponents["authority"]) && (string) $urlComponents["authority"] !== "") {
            $normalizedUrl .= $urlComponents["authority"] . "@";
        }

        /* Hostname. */
        $normalizedUrl .= $urlComponents["hostname"];

        /* Port, only when present and non-empty. */
        if (isset($urlComponents["port"]) && (string) $urlComponents["port"] !== "") {
            $normalizedUrl .= ":" . $urlComponents["port"];
        }

        /* Request (a null request yields just the trailing slash). */
        $normalizedUrl .= "/" . $urlComponents["request"];

        return $normalizedUrl;
    }
}
|