<?php
/**
* Class UrlExtractor
*
* PHP version 5
*
* @category Utilities
* @package UrlExtractor
* @author Joao Ribeiro <joaopedrocr@gmail.com>
* @license http://www.gnu.org/copyleft/gpl.html GNU General Public License
* @link http://urlextractor.joaoperibeiro.com
*/
namespace rollbackpt\UrlExtractor;
/**
* PHP Class to extract images and meta data information from URLs.
*
* @category Utilities
* @package UrlExtractor
* @author Joao Ribeiro <joaopedrocr@gmail.com>
* @license http://www.gnu.org/copyleft/gpl.html GNU General Public License
* @link http://urlextractor.joaoperibeiro.com
*
* @TODO: Extract thumbnails from videos
* @TODO: Split the code into smaller classes (one to handle meta tags
* and another to handle images and thumbnails)
* @TODO: Change get_meta_tags to Regex
* @TODO: Add method to get existing extracted data
*/
class UrlExtractor
{
    /**
     * Transport switch: true downloads the page with cURL (curlGetContents),
     * false uses file_get_contents.
     */
    const CURL = true;

    /**
     * URL passed to the last extractAll() call
     *
     * @var string $url
     */
    protected $url;

    /**
     * Scheme, host and port of the URL (Ex: http://localhost.com), used as
     * the prefix for relative image URLs
     *
     * @var string $host
     */
    protected $host;

    /**
     * Array to store all the images extracted from the URL
     *
     * @var array $images
     */
    public $images = array();

    /**
     * Title extracted from the URL
     *
     * @var string $title
     */
    public $title;

    /**
     * Description extracted from the URL
     *
     * @var string $description
     */
    public $description;

    /**
     * Array to store the keywords extracted from the URL
     *
     * @var array $keywords
     */
    public $keywords = array();

    /**
     * Meta tag names to extract, grouped by the class attribute they feed.
     * Within a group, a tag listed later overwrites a scalar attribute set
     * by an earlier one.
     *
     * @var array $metaTagNames
     */
    protected $metaTagNames = array(
        'title' => array(
            'twitter:title',
            'og:title'
        ),
        'description' => array(
            'description',
            'twitter:description',
            'og:description'
        ),
        'keywords' => array(
            'keywords'
        ),
        'images' => array(
            'twitter:image',
            'twitter:image:src',
            'og:image'
        )
    );

    /**
     * Class constructor.
     *
     * @return void
     */
    public function __construct()
    {
        // Empty constructor for future implementations
    }

    /**
     * Function extractAll
     *
     * Extract all the elements from the URL
     * (title, description, keywords and images)
     *
     * @param string  $url  URL of the page to inspect
     * @param boolean $json Define if the result is returned in an array or a
     * json string.
     *
     * @return array|string Array or JSON string with the url info
     * (title, description, keywords and images), or with a single 'error'
     * entry when the page could not be downloaded
     *
     * @throws \InvalidArgumentException When $url is empty
     */
    public function extractAll($url, $json = true)
    {
        if (empty($url)) {
            // Fully qualified: an unqualified class name inside this
            // namespace would not resolve to the global exception class.
            throw new \InvalidArgumentException("URL can't be empty!");
        }
        $this->url = $url;

        // Clean variables from old calls
        $this->host = "";
        $this->title = "";
        $this->description = "";
        $this->keywords = array();
        $this->images = array();

        // Get the url contents for extraction
        if (self::CURL) {
            $urlContent = $this->curlGetContents($this->url);
        } else {
            $urlContent = @file_get_contents($this->url);
        }

        // The failure check has to come before any string manipulation,
        // otherwise false is silently converted to an empty string.
        if (!is_string($urlContent)) {
            $error = array('error' => 'Invalid URL');
            return ($json) ? json_encode($error) : $error;
        }

        // Put every meta tag on its own line (case-insensitively, like the
        // matchers) so glued meta tags cannot confuse the regex matchers
        $urlContent = str_ireplace("<meta", "\n<meta", $urlContent);

        $this->getHost($this->url);
        $this->getMetaTagsByProperty($urlContent);
        $this->getPageTitle($urlContent);
        $this->getMetaTagsByName($urlContent);
        $this->getImages($urlContent);

        $urlInfo = array(
            'title' => $this->title,
            'description' => $this->description,
            // Before assign, remove duplicate keywords and reorder the array
            'keywords' => array_values(array_unique($this->keywords)),
            // Before assign, remove duplicate images and reorder the array
            'images' => array_values(array_unique($this->images))
        );

        return ($json) ? json_encode($urlInfo) : $urlInfo;
    }

    /**
     * Function getHost
     *
     * Get the host from the URL (Ex: http://localhost.com
     * is the host extracted from http://localhost.com/test/index.php).
     * The scheme and port are kept when present; the host is set to an
     * empty string when none can be found.
     *
     * @param string $url
     *
     * @return void
     */
    protected function getHost($url)
    {
        $this->host = '';

        $parts = parse_url($url);
        if (!is_array($parts) || empty($parts['host'])) {
            // Scheme-less input such as "example.com/page" is parsed as a
            // plain path; retry it as a network-path reference.
            $parts = parse_url('//' . ltrim($url, '/'));
        }
        if (!is_array($parts) || empty($parts['host'])) {
            return;
        }

        $host = $parts['host'];
        if (!empty($parts['scheme'])) {
            $host = $parts['scheme'] . '://' . $host;
        }
        if (!empty($parts['port'])) {
            $host .= ':' . $parts['port'];
        }
        $this->host = $host;
    }

    /**
     * Function getPageTitle
     *
     * Get the text inside the title tag. A title that was already set
     * (for example from og:title) is kept when the page has no title tag.
     *
     * @param string $urlContent Page content to get the title from
     *
     * @return void
     */
    protected function getPageTitle($urlContent)
    {
        $title = $this->getText($urlContent, "<title>", "</title>");
        if ($title !== '') {
            $this->title = $title;
        }
    }

    /**
     * Function getMetaTagsByName
     *
     * Get the regular meta tags (description and keywords)
     *
     * @param string $urlContent Url content to get meta tags from
     *
     * @return void
     */
    protected function getMetaTagsByName($urlContent)
    {
        $pattern = $this->buildMetaTagPattern('name', 'description|keywords');
        preg_match_all($pattern, $urlContent, $results);
        $metaTags = $this->formatMetaTagsArray($results);
        if ($metaTags !== false) {
            $this->setUrlAtributes($metaTags);
        }
    }

    /**
     * Function getMetaTagsByProperty
     *
     * Get property meta tags like open graph for example
     * (Ex: <meta property="og:title" content="The Rock" />)
     *
     * @param string $urlContent Url content to get meta tags from
     *
     * @return void
     */
    protected function getMetaTagsByProperty($urlContent)
    {
        $pattern = $this->buildMetaTagPattern('property', '[^"\']*');
        preg_match_all($pattern, $urlContent, $results);
        $metaTags = $this->formatMetaTagsArray($results);
        if ($metaTags !== false) {
            $this->setUrlAtributes($metaTags);
        }
    }

    /**
     * Function buildMetaTagPattern
     *
     * Build the regex that matches a meta tag in both attribute orders.
     * Capture groups: 1 = identifier and 2 = content when the identifier
     * comes first, 3 = content and 4 = identifier when the content comes
     * first. This is the layout formatMetaTagsArray expects.
     *
     * @param string $attribute    Identifying attribute ("name", "property")
     * @param string $valuePattern Regex fragment for the accepted values
     *
     * @return string The complete, delimited regex
     */
    protected function buildMetaTagPattern($attribute, $valuePattern)
    {
        $identifier = $attribute . '=["\'](' . $valuePattern . ')["\']';
        // Branch reset keeps the content in one capture group while the
        // value may only end at the same quote character that opened it,
        // so content="It's fine" is not cut at the apostrophe.
        $content = 'content=(?|"([^"]*)"|\'([^\']*)\')';

        // [^>] keeps everything outside the content value inside one tag
        return '/<meta\b[^>]*?\s' . $identifier . '[^>]*?\s' . $content . '[^>]*>'
            . '|<meta\b[^>]*?\s' . $content . '[^>]*?\s' . $identifier . '[^>]*>/i';
    }

    /**
     * Function getImages
     *
     * Get the images from the URL
     *
     * @param string $urlContent
     *
     * @return void
     */
    protected function getImages($urlContent)
    {
        // Same quoting rule as the meta tags: the src value ends at the
        // quote character that opened it, and the match stays inside the tag
        $pattern = '/<img\b[^>]*?src=(?|"([^"]*)"|\'([^\']*)\')[^>]*>/i';
        preg_match_all($pattern, $urlContent, $results);
        foreach ($results[1] as $image) {
            $image = $this->checkImageUrl($image);
            if ($image !== null) {
                $this->images[] = $image;
            }
        }
    }

    /**
     * Function setUrlAtributes
     *
     * Set the class attributes and overwrite duplicates
     * (Ex: Description, Keywords). Tag names are compared
     * case-insensitively and empty contents are ignored.
     *
     * @param array $metaTags Map of meta tag name => content
     *
     * @return void
     */
    protected function setUrlAtributes($metaTags)
    {
        if (!is_array($metaTags)) {
            return;
        }
        // The matchers are case-insensitive (name="Description"), so the
        // lookup has to be case-insensitive as well.
        $metaTags = array_change_key_case($metaTags, CASE_LOWER);

        foreach ($this->metaTagNames as $key => $names) {
            foreach ($names as $name) {
                $name = strtolower($name);
                if (!array_key_exists($name, $metaTags)) {
                    continue;
                }
                $content = $metaTags[$name];
                if (empty($content)) {
                    continue;
                }
                if (!is_array($this->$key)) {
                    $this->$key = $content;
                    continue;
                }
                if ($key == 'keywords') {
                    // Hard coded rule to split keywords by ","
                    foreach (explode(",", $content) as $keyword) {
                        $keyword = trim($keyword);
                        // "a,,b" or a trailing comma must not add blanks
                        if ($keyword !== '') {
                            $this->{$key}[] = $keyword;
                        }
                    }
                } else {
                    $this->{$key}[] = $content;
                }
            }
        }
    }

    /**
     * Function formatMetaTagsArray
     *
     * Utility function used by getMetaTagsByName and getMetaTagsByProperty
     * to turn the preg_match_all result into the array expected by
     * setUrlAtributes. Both attribute orders are merged: groups 1/2 hold
     * identifier/content, groups 3/4 hold content/identifier. Names are
     * lower-cased and a later tag overwrites an earlier one of the same name.
     *
     * @param array $array Result of preg_match_all (pattern order)
     *
     * @return array|boolean Map of tag name => content, or false when none
     * of the names listed in $metaTagNames was found
     */
    protected function formatMetaTagsArray($array)
    {
        if (!is_array($array) || !isset($array[1]) || !is_array($array[1])) {
            return false;
        }

        $metaTags = array();
        foreach ($array[1] as $index => $name) {
            $content = isset($array[2][$index]) ? $array[2][$index] : '';
            if ($name === '' && isset($array[4][$index])) {
                // The tag was written with the content attribute first
                $name = $array[4][$index];
                $content = isset($array[3][$index]) ? $array[3][$index] : '';
            }
            if ($name !== '') {
                $metaTags[strtolower($name)] = $content;
            }
        }

        $pattern = '/^(?:' . $this->getPropertyRuleString() . ')$/i';
        if (!preg_grep($pattern, array_keys($metaTags))) {
            return false;
        }

        return $metaTags;
    }

    /**
     * Function checkImageUrl
     *
     * Utility function used by getImages to check image URL
     * and complete relative URLs. Relative URLs are resolved against the
     * host root ("../" segments are dropped), protocol-relative URLs get
     * the scheme of the page (http when unknown).
     *
     * @param string $url Url of the image to be checked
     *
     * @return string|null Image url, or null when the URL does not point to
     * a jpg, jpeg, gif, png or bmp file
     */
    protected function checkImageUrl($url)
    {
        $url = trim($url);
        $extension = '\.(?:jpe?g|gif|png|bmp)';

        // Absolute URL: returned untouched when it names an image file
        if (preg_match('/^[a-z][a-z0-9+.\-]*:\/\//i', $url)) {
            return preg_match('/' . $extension . '/i', $url) ? $url : null;
        }

        $url = preg_replace('/\.\.\//', '', $url);
        $url = preg_replace('/^(?:\.\/)+/', '', $url);

        // Relative URL: the path has to end with an image extension; a
        // query string or fragment may follow
        if (!preg_match('/' . $extension . '(?:[?#].*)?$/i', $url)) {
            return null;
        }

        if (strpos($url, '//') === 0) {
            $scheme = parse_url($this->host, PHP_URL_SCHEME);
            return ($scheme ? $scheme : 'http') . ':' . $url;
        }

        return ($url[0] === '/') ? $this->host . $url : $this->host . '/' . $url;
    }

    /**
     * Function getText
     *
     * Utility function that extracts the text between the first occurrence
     * of the start marker and the next occurrence of the end marker
     *
     * @param string $text
     * @param string $start
     * @param string $end
     *
     * @return string The text extracted, or an empty string when either
     * marker is missing
     */
    protected function getText($text, $start, $end)
    {
        $text = (string) $text;
        if ($start === '' || $end === '') {
            return '';
        }

        $from = strpos($text, $start);
        if ($from === false) {
            return '';
        }
        $from += strlen($start);

        $to = strpos($text, $end, $from);
        if ($to === false) {
            return '';
        }

        return (string) substr($text, $from, $to - $from);
    }

    /**
     * Function getPropertyRuleString
     *
     * Goes through all the property names and generates a regex rule to
     * match at least one of them
     *
     * @return string Property names, escaped for use inside a "/"-delimited
     * regex and concatenated with |
     */
    protected function getPropertyRuleString()
    {
        $names = array();
        foreach ($this->metaTagNames as $type) {
            foreach ($type as $tag) {
                $names[] = preg_quote($tag, '/');
            }
        }
        return implode('|', $names);
    }

    /**
     * Function curlGetContents
     *
     * Same as file_get_contents but using curl to avoid getting error 403
     * forbidden because the request doesn't have a valid user agent.
     * Redirects are followed, only http and https are allowed and the
     * request is bounded by timeouts.
     *
     * @param string $url Url to get the contents from using Curl
     *
     * @return string|boolean Contents obtained from the url, or false when
     * the request failed
     */
    protected function curlGetContents($url)
    {
        // create curl resource
        $ch = curl_init();
        if ($ch === false) {
            return false;
        }
        // set url
        curl_setopt($ch, CURLOPT_URL, $url);
        // return the transfer as a string
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.13) Gecko/20080311 Firefox/2.0.0.13');
        // follow redirects like file_get_contents does, but not endlessly
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
        curl_setopt($ch, CURLOPT_MAXREDIRS, 5);
        // never hang forever on an unresponsive server
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
        curl_setopt($ch, CURLOPT_TIMEOUT, 30);
        // the URL comes from the caller: keep curl away from file://,
        // gopher:// and friends, also after a redirect
        if (defined('CURLOPT_PROTOCOLS') && defined('CURLOPT_REDIR_PROTOCOLS')) {
            curl_setopt($ch, CURLOPT_PROTOCOLS, CURLPROTO_HTTP | CURLPROTO_HTTPS);
            curl_setopt($ch, CURLOPT_REDIR_PROTOCOLS, CURLPROTO_HTTP | CURLPROTO_HTTPS);
        }
        // $output contains the output string (false on failure)
        $output = curl_exec($ch);
        // close curl resource to free up system resources
        curl_close($ch);
        return $output;
    }
}
|