<?php
/**
* UrlTool Class:
* A class to parse, validate, encode, and check url status.
*
* @version 1.1
* @author Hossamzee (hossam_zee@yahoo.com).
* @date 7 Aug 2012.
*/
class UrlTool
{
/**
* Parses a url and gets the components of it.
*
* @param string $url Url to be parsed.
* @param string If there is an error, it then is filled in this variable (passed-by-reference).
* @return mixed Array of components of the url if it is validated, or false.
*/
public /* mixed */ function parseUrl($url, &$error = "")
{
/* Initialize the components array. */
$components = array();
/* Push url to components array. */
$components["url"] = $url;
/* Initialize variables. */
$scheme = null;
$ipversion = null;
$authority = null;
$hostRequest = null;
$host = null;
$port = null;
$hostname = null;
$request = null;
$path = null;
$querystring = null;
$fragment = null;
/* Get the scheme of the url. */
if (preg_match("/^([A-Z][A-Z0-9\+\-\.]+):\/\//i", $url) > 0)
{
$colonDoubleSlashesPos = strpos($url, "://");
$scheme = substr($url, 0, $colonDoubleSlashesPos);
$hostRequest = substr($url, $colonDoubleSlashesPos+3);
}
else
{
/* PREVIOUS: $scheme = null; */
$hostRequest = $url;
}
/* Get the host and the request and split them apart. */
$slashPos = strpos($hostRequest, '/');
if ($slashPos !== false)
{
$host = substr($hostRequest, 0, $slashPos);
$request = substr($hostRequest, $slashPos+1);
}
else
{
$host = $hostRequest;
$request = null;
}
/* Get authority from host. */
$atPos = strpos($host, '@');
if ($atPos !== false)
{
$authority = substr($host, 0, $atPos);
$host = substr($host, $atPos+1);
}
else
{
$authority = null;
}
/* If the ip-version (of the host) is IPv6. */
if ($host{0} == '[')
{
$squareBracketColonPos = strpos($host, "]:");
if ($squareBracketColonPos !== false)
{
$hostname = substr($host, 0, $squareBracketColonPos+1);
$port = substr($host, $squareBracketColonPos+2);
}
else
{
$hostname = $host;
$port = null;
}
/* Set the ip version to 6. */
$ipversion = 6;
}
/* If the ip-version is IPv4. */
else
{
$colonPos = strpos($host, ':');
if ($colonPos !== false)
{
$hostname = substr($host, 0, $colonPos);
$port = substr($host, $colonPos+1);
}
else
{
$hostname = $host;
$port = null;
}
/* Set the ip version to be 4. */
$ipversion = 4;
}
/* Strip dot from hostname. */
if ($hostname{strlen($hostname)-1} == '.')
{
$hostname = substr($hostname, 0, -1);
}
/* Set the path to be request, initially. */
$path = $request;
/* Get the fragment of the url. */
$hashPos = strpos($path, '#');
if ($hashPos !== false)
{
$fragment = substr($path, $hashPos+1);
$path = substr($path, 0, $hashPos);
}
/* Get the query string of the url. */
$questionMarkPos = strpos($path, '?');
if ($questionMarkPos !== false)
{
$querystring = substr($path, $questionMarkPos+1);
$path = substr($path, 0, $questionMarkPos);
}
/* Push results to components. */
$components["scheme"] = $scheme;
$components["ipversion"] = $ipversion;
$components["authority"] = $authority;
$components["port"] = $port;
$components["hostname"] = $hostname;
$components["request"] = $request;
$components["path"] = $path;
$components["querystring"] = $querystring;
$components["fragment"] = $fragment;
/* Validate the url components. */
if ($this->validateUrlComponents($components, $error) === false)
{
/* If the url is not valid. */
return false;
}
else
{
/* If the url is valid. */
return $components;
}
}
/**
* Validates url components.
*
* @param array Components of the url (passed-by-reference).
* @param string If there is an error, it then is filled in this variable (passed-by-reference).
* @return bool True if the url components are valid, false otherwise.
*/
private /* bool */ function validateUrlComponents(&$components = array(), &$error = "")
{
/* Validate the scheme of the url. */
if ($components["scheme"] != null)
{
if (preg_match("/([A-Z][A-Z0-9\+\-\.]+)/i", $components["scheme"]) == 0)
{
/* If the scheme did not match the pattern. */
$error = "The scheme did not match the pattern ({$components["scheme"]}).";
return false;
}
}
else
{
/* If the scheme is empty. */
$components["scheme"] = "http";
}
/* Validate the port if there is any. */
if ($components["port"] != null)
{
if (!is_numeric($components["port"]))
{
/* If the port is not a number. */
$error = "The port is not a number ({$components["port"]}).";
return false;
}
}
else
{
//$components["port"] = getservbyname($components["scheme"], "tcp");
}
/* Validate the hostname. */
if ($components["hostname"] == "")
{
/* If the hostname is empty (mandatory variable). */
$error = "The hostname is empty (mandatory variable).";
return false;
}
/* Validate the . */
if ($components["authority"] != null && $components["authority"] != "")
{
preg_match("/((%[0-9A-F]{2})|([0-9A-Z|'~!$&*()_+=;:.,-]))*/i", $components["authority"], $authorityMatches);
/* Check the difference between the two strings. */
$authorityDiff = str_replace($authorityMatches[0], '', $components["authority"]);
if ($authorityDiff != "")
{
$wrongSymbol = $authorityDiff{0};
$error = "Wrong symbol used in authority ($wrongSymbol).";
return false;
}
}
/* Split the domain parts. */
$domain_parts = explode(".", $components["hostname"]);
/* If the host name is like (.com, .net) */
if ($domain_parts[0] == "" || $domain_parts[1] == "")
{
$error = "The hostname does not look like hostname.";
return false;
}
/* Validate that the hostname is ipv6. */
if ($components["ipversion"] == 6)
{
$hostnameWithoutSquareBrackets = substr($components["hostname"], 1, -1);
/* Validate the syntax of ip version future. */
if (preg_match("/v[0-9A-F]+\.[A-Z0-9\-\.\_\~\!\$\&\'\(\)\*\+\,\;\=]+/i", $hostnameWithoutSquareBrackets))
{
/* If the ipvfuture is value, e.g. .*/
$components["ipversion"] = "future";
return true;
}
/* Validate the syntax of ipv6. */
/* Source: http://crisp.tweakblogs.net/blog/2031 */
if (preg_match("/^(?:[a-f0-9]{1,4}(?::[a-f0-9]{1,4})*|[a-f0-9]{1,4}(?::[a-f0-9]{1,4})*::(?:[a-f0-9]{1,4}(?::[a-f0-9]{1,4})*)?
|::(?:[a-f0-9]{1,4}(?::[a-f0-9]{1,4})*)?)(?::\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})?$/ix", $hostnameWithoutSquareBrackets, $match) > 0)
{
/* If the ipv6 is valid, e.g. http://[fe80:0:0:0:202:b3ff:fe1e:8329]. */
return true;
}
else
{
/* If the hostname is not valid as an ipv6. */
$error = "The hostname is not valid as an ipv6 ({$hostnameWithoutSquareBrackets}).";
return false;
}
}
/* Validate that the hostname is ipv4. */
if ($components["ipversion"] == 4)
{
if (strpos($components["hostname"], '.') !== false)
{
/* Validate that the hostname is an ip. */
if (preg_match("/^\b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\b$/", $components["hostname"]) > 0)
{
/* If the hostname is a valid ip address. */
return true;
}
else
{
/* It might be a regular hostname. */
if (preg_match("/[\:\/\?\#\[\]\@\s]+/", $components["hostname"]))
{
$error = "The hostname is not valid.";
return false;
}
else
{
/* If the hostname without TLD is valid. */
$components["ipversion"] = "reg-name";
return true;
}
}
}
else
{
/* If the hostname did not contain a dot '.'. */
$error = "The hostname did not contain a dot ({$components["hostname"]}).";
return false;
}
}
}
/**
* Checks if the url exists or not (not-in-use).
*
* @param string Url to be checked.
* @param float Time taken to response (passed-by-reference).
* @return bool True if the url exists, false otherwise.
*/
public /* bool */ function checkUrl($url, &$responseTime)
{
/* Set the request method to be head. */
stream_context_set_default(array("http" => array("method" => "HEAD", "max_redirects" => 1)));
/* Set start time. */
$startTime = array_sum(explode(" ", microtime()));
/* Send a head request. */
$headers = get_headers($url);
/* Set finish time. */
$finishTime = array_sum(explode(" ", microtime()));
/* Get HTTP response code. */
preg_match("/HTTP\/\d\.\d (\d{3})/i", $headers[0], $responseArray);
/* Set the response time. */
$responseTime = $finishTime - $startTime;
/* Return true, if the url is not 404, else, otherwise. */
return ($responseArray[1] != 404);
}
/**
* Encodes a normal domain name (Unicode/UTF-8) to Punycode (to-do).
* @param string Domain name (UTF-8).
* @return string Punycode of the domain.
*/
public /* string */ function domainToPunycode($domain)
{
return "";
}
/**
* Normalize URL to be in this format: scheme://[authority@]hostname[:port]/[request]
* @param array URL components.
* @return string Normalized URL.
*/
public function normalizeUrl($urlComponents)
{
// Scheme
$normalizedUrl = $urlComponents["scheme"] . "://";
// Authority?
$normalizedUrl .= ($urlComponents["authority"] != null && $urlComponents["authority"] != "") ? $urlComponents["authority"] . "@" : "";
// Hostname
$normalizedUrl .= $urlComponents["hostname"];
// Port
$normalizedUrl .= ($urlComponents["port"] != null && $urlComponents["port"] != "") ? ":" . $urlComponents["port"] : "";
// Request
$normalizedUrl .= "/" . $urlComponents["request"];
return $normalizedUrl;
}
}
|