Login   Register  
PHP Classes
elePHPant
Icontem

File: class.html_transform.php

Recommend this page to a friend!
Stumble It! Stumble It! Bookmark in del.icio.us Bookmark in del.icio.us
  Classes of Keyvan Minoukadeh  >  HTTP Navigator  >  class.html_transform.php  >  Download  
File: class.html_transform.php
Role: ???
Content type: text/plain
Description: Simple example class for modifying HTML
Class: HTTP Navigator
Web fetching
Author: By Keyvan Minoukadeh
Last change:
Date: 2002-04-23 02:05
Size: 7,147 bytes
 

Contents

Class file image Download
<?php
// tab/indent == 4 spaces

// $Date: 2002/04/21 03:08:15 $
// $Revision: 1.4 $

/**
* HTML Transform class
*
* PHP class to modify HTML content.
*
* Note: I've quickly put this together for the
*       http_navigator example, it's not complete
*       and will change in future.
*
* @author   Keyvan Minoukadeh <keyvan@k1m.com>
* @version  0.1.2 alpha
*/
class html_transform
{
	// html content as supplied to the constructor (trimmed); not modified afterwards
	var $html_orig;
	// working copy of the html content; every transformation is applied to this
	var $html;

	// directory-style base url (ends in "/") used to resolve relative urls, "" if unset
	var $base_url			= "";
	// url prefix that rewritten navigation links are routed through, "" for none
	var $pass_through		= "";

	// if true, process() lets a <base href> tag in the document override $base_url
	var $use_base_href		= true;
	// if true, process() rewrites src/href attributes
	var $rewrite_url		= true;

	// these vars would hold form details and frame details
	// (to be added when I get some more time)
	var $form				= array();
	var $frame				= array();


	/**
	* CONSTRUCTOR
	*
	* @param	string	$html_orig	html content
	* @param	string	$base_url	base url to rewrite relative urls with
	*/
	function __construct($html_orig="", $base_url="")
	{
		$this->html_orig = trim($html_orig);
		$this->html = $this->html_orig;
		$this->set_base_url($base_url);
	}

	/**
	* Legacy (PHP 4 style) constructor
	*
	* Same-name constructors are no longer invoked by `new` in PHP 8, so the
	* real work lives in __construct(). This method is kept so that code
	* calling it explicitly (eg. from a subclass) keeps working.
	*
	* @param	string	$html_orig	html content
	* @param	string	$base_url	base url to rewrite relative urls with
	*/
	function html_transform($html_orig="", $base_url="")
	{
		$this->__construct($html_orig, $base_url);
	}

	/**
	* Use base href
	*
	* If <base href..> tag found, use it as base url? (default: yes)
	*
	* @param	bool	$use_base_href
	* @return	bool				always true
	*/
	function use_base_href($use_base_href=true)
	{
		$this->use_base_href = $use_base_href;
		return true;
	}

	/**
	* Set base URL
	*
	* This URL will be prepended to all relative URLs found. Only http:// and
	* https:// URLs are accepted. The stored value is reduced to its directory
	* part (everything up to and including the last "/" of the path).
	*
	* @param	string	$base_url	prepend this URL to all relative paths
	* @return	bool				true if accepted, false (base url unchanged) otherwise
	*/
	function set_base_url($base_url)
	{
		$base_url = trim($base_url);
		if (!preg_match('!^https?://!i', $base_url)) {
			return false;
		}
		// drop query string and fragment first: a "/" inside them must not be
		// mistaken for the last path separator
		$base_url = preg_replace('/[?#].*$/s', '', $base_url);
		if (substr_count($base_url, "/") > 2) {
			// has a path: keep everything up to and including the last "/"
			$base_url = substr($base_url, 0, strrpos($base_url, "/") + 1);
		} else {
			// scheme and host only: make it directory-style
			$base_url .= "/";
		}
		$this->base_url = $base_url;
		return true;
	}

	/**
	* Set pass through URL
	*
	* Should point to a script which can process the page, eg.
	* http://www.example.com/process.php?url=
	*
	* @param	string	$pass_through	All URLs will pass through this script
	* @return	bool					always true
	*/
	function set_pass_through($pass_through)
	{
		$this->pass_through = $pass_through;
		return true;
	}

	/**
	* Count string
	*
	* Returns number of times string found in html (case sensitive).
	* An empty search string counts as 0 matches.
	*
	* @param	mixed	$string		string or array containing strings to find
	* @param	bool	$strip_tags	strip html tags before counting?
	* @return	mixed				int containing number of matches, or associative array containing
	*								subject as key and matches as value
	*/
	function count_string($string, $strip_tags=false)
	{
		$content = (($strip_tags) ? strip_tags($this->html) : $this->html);
		if (is_array($string)) {
			$found = array();
			foreach ($string as $val) {
				$found["$val"] = $this->_count_substring($val, $content);
			}
			return $found;
		}
		return $this->_count_substring($string, $content);
	}

	/**
	* Count occurrences of one substring; substr_count() rejects an empty needle, so report 0 for it
	*/
	function _count_substring($needle, $content)
	{
		$needle = (string)$needle;
		return ($needle === "") ? 0 : substr_count($content, $needle);
	}

	/**
	* Count word
	*
	* Returns number of times word (regex word boundary used) is found in html.
	* Matching is case insensitive. Words given in an array are trimmed first.
	*
	* @param	mixed	$word		string or array containing words to find
	* @param	bool	$strip_tags	strip html tags before counting?
	* @return	mixed				int containing number of matches, or associative array containing
	*								subject as key and matches as value
	*/
	function count_word($word, $strip_tags=false)
	{
		$content = (($strip_tags) ? strip_tags($this->html) : $this->html);
		if (is_array($word)) {
			$found = array();
			foreach ($word as $val) {
				$val = trim($val);
				$found["$val"] = $this->_count_whole_word($val, $content);
			}
			return $found;
		}
		return $this->_count_whole_word($word, $content);
	}

	/**
	* Count whole-word, case-insensitive occurrences of one word
	*/
	function _count_whole_word($word, $content)
	{
		// the delimiter must be passed to preg_quote(), otherwise a "/" in the
		// word terminates the pattern early
		$total = preg_match_all("/\\b".preg_quote($word, "/")."\\b/i", $content, $matches);
		return ($total ? $total : 0);
	}

	/**
	* Search and replace
	*
	* Search html file for 1st argument, replace with 2nd argument
	*
	* @param	mixed	$search		string or array containing strings to find
	* @param	mixed	$replace	string or array containing replacement string
	* @return	bool				always true
	*/
	function search_replace($search, $replace)
	{
		$this->html = str_replace($search, $replace, $this->html);
		return true;
	}


	/**
	* Process html
	*
	* Picks up the document's <base href> (when enabled) and then rewrites the
	* src/href attributes of a, area, img, link, frame, iframe and input tags.
	*
	* @return	bool	always true
	*/
	function process()
	{
		// the url is captured greedily up to the closing quote/space/">";
		// \b after "base" keeps <basefont> from matching
		if ($this->use_base_href &&
				preg_match('!<base\b[^>]*?\shref\s*=\s*["\']?(https?://[^"\'>\s]+)!i', $this->html, $matches)) {
			$this->set_base_url(trim($matches[1]));
		}
		if ($this->rewrite_url) {
			// [^>]*? keeps the match inside a single tag, \b stops "a" from
			// matching <abbr>, <address> etc., and the \s before src/href
			// avoids attributes such as data-src
			$rewritten = preg_replace_callback(
								'/<(a|area|img|link|frame|iframe|input)\b([^>]*?\s)(src|href)(\s*=\s*)("|\')?([^> \'"]+)/i',
								array($this, 'rewrite_url'), $this->html);
			// null signals a PCRE failure; keep the current html in that case
			if ($rewritten !== null) {
				$this->html = $rewritten;
			}
		}
		return true;
	}

	/**
	* Rewrite URL
	*
	* Callback for preg_replace_callback() in process(). Taken by value:
	* preg_replace_callback() does not pass its matches by reference.
	*
	* @param	array	$modify		regex matches: [1] tag name, [2] text between tag name and
	*								attribute, [3] attribute name, [4] "=" with surrounding
	*								whitespace, [5] opening quote (may be empty), [6] url
	* @return	string				replacement for the whole match ($modify[0] if unchanged)
	*/
	function rewrite_url($modify)
	{
		$url = trim($modify[6]);
		$pre = "<".$modify[1].$modify[2].$modify[3].$modify[4].$modify[5];

		$url_split = parse_url($url);
		if (!$url_split) {
			// unparsable, return unchanged
			return $modify[0];
		}

		// if mailto link, return unchanged
		if (strtolower(substr($url, 0, 7)) == "mailto:") {
			return $modify[0];
		}

		// if scheme included (absolute): only http(s) urls are touched
		if (isset($url_split["scheme"])) {
			$scheme = strtolower($url_split["scheme"]);
			if ($scheme != "http" && $scheme != "https") {
				return $modify[0];
			}
			return $pre.$this->_maybe_pass_through($modify[1], $url);
		}

		// if fragment only (#??), return unchanged
		if (substr($url, 0, 1) == "#") {
			return $modify[0];
		}

		// scheme not included (relative): a base url is needed to resolve it
		if (empty($this->base_url)) {
			return $modify[0];
		}
		$base_split = parse_url($this->base_url);
		if (!is_array($base_split) || !isset($base_split["host"])) {
			return $modify[0];
		}
		$base_scheme = (isset($base_split["scheme"]) ? strtolower($base_split["scheme"]) : "http");
		// the fragment is kept outside the pass through url
		$fragment = (isset($url_split["fragment"]) ? "#".$url_split["fragment"] : "");

		// protocol-relative (//host/path): only the scheme comes from the base url
		if (isset($url_split["host"])) {
			$hash = strpos($url, "#");
			$ret = $base_scheme.":".(($hash === false) ? $url : substr($url, 0, $hash));
			return $pre.$this->_maybe_pass_through($modify[1], $ret).$fragment;
		}

		if (!isset($url_split["path"])) {
			// eg. query only (?a=b), return unchanged
			return $modify[0];
		}

		if (substr($url_split["path"], 0, 1) == "/") {
			// root-relative: scheme, host and port come from the base url
			$ret = $base_scheme."://".$base_split["host"]
					.(isset($base_split["port"]) ? ":".$base_split["port"] : "")
					.$url_split["path"];
		} else {
			// directory-relative: append to the base directory
			$ret = $this->base_url.$url_split["path"];
		}
		// explicit comparison so that a query of "0" is not dropped
		if (isset($url_split["query"]) && $url_split["query"] !== "") {
			$ret .= "?".$url_split["query"];
		}
		return $pre.$this->_maybe_pass_through($modify[1], $ret).$fragment;
	}

	/**
	* Route url through the pass through script if the tag is a navigation tag
	* (a, frame, area, iframe); urls of other tags (img, link, input) are returned as is
	*/
	function _maybe_pass_through($tag, $url)
	{
		if (in_array(strtolower($tag), array("a","frame","area","iframe"), true)) {
			return $this->add_passthrough($url);
		}
		return $url;
	}

	/**
	* Add passthrough
	*
	* @param	string	$url	absolute url
	* @return	string			pass through prefix followed by the url-encoded url,
	*							or the url unchanged when no pass through is set
	*/
	function add_passthrough($url)
	{
		if (!empty($this->pass_through)) {
			return $this->pass_through.urlencode(trim($url));
		} else {
			return $url;
		}
	}

	/**
	* Add base tag
	*
	* Removes any existing <base> tags and inserts a new one directly after the
	* opening <html> tag. Nothing is inserted if the document has no <html> tag.
	*
	* @param	string	$url	raw (not html-escaped) url for the href attribute
	* @return	bool			always true
	*/
	function add_base($url)
	{
		// \b keeps <basefont> tags intact
		$this->html = preg_replace('/<base\b[^>]*>/i', '', $this->html);
		if (preg_match('/<html\b[^>]*>/i', $this->html, $matches, PREG_OFFSET_CAPTURE)) {
			// insert by offset rather than via a regex replacement string, so
			// the <html> tag keeps its attributes and "$" or "\" in the url
			// cannot be read as backreferences
			$insert_at = $matches[0][1] + strlen($matches[0][0]);
			$tag = '<base href="'.htmlspecialchars($url, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8').'" />';
			$this->html = substr_replace($this->html, $tag, $insert_at, 0);
		}
		return true;
	}


}

?>