<?php
// tab/indent == 4 spaces
// $Date: 2002/04/21 03:08:15 $
// $Revision: 1.4 $
/**
* HTML Transform class
*
* PHP class to modify HTML content.
*
* Note: I've quickly put this together for the
* http_navigator example, it's not complete
* and will change in future.
*
* @author Keyvan Minoukadeh <keyvan@k1m.com>
* @version 0.1.2 alpha
*/
class html_transform
{
    /** @var string html content as originally supplied (trimmed) */
    var $html_orig;
    /** @var string working copy of the html content; all methods operate on this */
    var $html;
    /** @var string directory URL (always ends in "/") that relative URLs are resolved against */
    var $base_url = "";
    /** @var string URL prefix that navigable links are routed through (empty = disabled) */
    var $pass_through = "";
    /** @var bool use the document's <base href> tag (if any) as base url? */
    var $use_base_href = true;
    /** @var bool rewrite src/href attributes when process() is called? */
    var $rewrite_url = true;

    // these vars would hold form details and frame details
    // (to be added when I get some more time)
    var $form = array();
    var $frame = array();

    /**
    * CONSTRUCTOR
    *
    * @param string $html_orig html content
    * @param string $base_url  base url to rewrite relative urls with
    */
    function __construct($html_orig="", $base_url="")
    {
        $this->html_orig = trim((string) $html_orig);
        $this->html      = $this->html_orig;
        // an invalid/empty base url is silently ignored (base_url stays "")
        $this->set_base_url($base_url);
    }

    /**
    * Legacy (PHP 4 style) constructor
    *
    * Since PHP 8 a method named after its class is no longer treated as a
    * constructor, so the real work lives in __construct(). This method is kept
    * so existing code calling it explicitly keeps working.
    *
    * @param string $html_orig html content
    * @param string $base_url  base url to rewrite relative urls with
    */
    function html_transform($html_orig="", $base_url="")
    {
        $this->__construct($html_orig, $base_url);
    }

    /**
    * Use base href
    *
    * If <base href..> tag found, use it as base url? (default: yes)
    *
    * @param bool $use_base_href
    * @return bool always true
    */
    function use_base_href($use_base_href=true)
    {
        $this->use_base_href = $use_base_href;
        return true;
    }

    /**
    * Set base URL
    *
    * This URL will be prepended to all relative URLs found. Only http:// and
    * https:// URLs are accepted. The stored value is reduced to the directory
    * part of the URL and always ends in "/".
    *
    * @param string $base_url prepend this URL to all relative paths
    * @return bool true if the URL was accepted, false otherwise (base url unchanged)
    */
    function set_base_url($base_url)
    {
        $base_url = trim((string) $base_url);
        if (strncasecmp($base_url, "http://", 7) !== 0 && strncasecmp($base_url, "https://", 8) !== 0) {
            return false;
        }
        // query string and fragment are not part of the directory; strip them
        // first so a "/" inside them cannot be mistaken for a path separator
        $base_url = substr($base_url, 0, strcspn($base_url, "?#"));
        if (substr_count($base_url, "/") > 2) {
            // has a path: keep everything up to and including the last "/"
            $base_url = substr($base_url, 0, strrpos($base_url, "/") + 1);
        } else {
            // host only (eg. http://www.example.com)
            $base_url .= "/";
        }
        $this->base_url = $base_url;
        return true;
    }

    /**
    * Set pass through URL
    *
    * Should point to a script which can process the page, eg.
    * http://www.example.com/process.php?url=
    *
    * @param string $pass_through All URLs will pass through this script
    * @return bool always true
    */
    function set_pass_through($pass_through)
    {
        $this->pass_through = $pass_through;
        return true;
    }

    /**
    * Count string
    *
    * Returns number of times string found in html (case sensitive).
    * An empty search string counts as 0 matches.
    *
    * @param mixed $string     string or array containing strings to find
    * @param bool  $strip_tags strip html tags before counting?
    * @return mixed int containing number of matches, or associative array containing
    *               subject as key and matches as value
    */
    function count_string($string, $strip_tags=false)
    {
        $content = $this->_content($strip_tags);
        if (is_array($string)) {
            $found = array();
            foreach ($string as $val) {
                $found["$val"] = $this->_count_substring($content, $val);
            }
            return $found;
        }
        return $this->_count_substring($content, $string);
    }

    /**
    * Count word
    *
    * Returns number of times word (regex word boundary used) is found in html.
    * Matching is case insensitive. Surrounding whitespace is trimmed from each
    * word; an empty word counts as 0 matches.
    *
    * @param mixed $word       string or array containing words to find
    * @param bool  $strip_tags strip html tags before counting?
    * @return mixed int containing number of matches, or associative array containing
    *               subject (trimmed) as key and matches as value
    */
    function count_word($word, $strip_tags=false)
    {
        $content = $this->_content($strip_tags);
        if (is_array($word)) {
            $found = array();
            foreach ($word as $val) {
                $val = trim($val);
                $found["$val"] = $this->_count_word_matches($content, $val);
            }
            return $found;
        }
        return $this->_count_word_matches($content, $word);
    }

    /**
    * Search and replace
    *
    * Search html file for 1st argument, replace with 2nd argument
    *
    * @param mixed $search  string or array containing strings to find
    * @param mixed $replace string or array containing replacement string
    * @return bool always true
    */
    function search_replace($search, $replace)
    {
        $this->html = str_replace($search, $replace, $this->html);
        return true;
    }

    /**
    * Process html
    *
    * If enabled, picks up the base url from a <base href> tag, then rewrites
    * the src/href attribute of a, area, img, link, frame, iframe and input tags
    * through rewrite_url().
    */
    function process()
    {
        $html = (string) $this->html;

        // [^>]*? keeps the match inside the <base> tag; the url capture is
        // greedy so the whole url (not just its first character) is taken
        if ($this->use_base_href &&
            preg_match('!<base\b[^>]*?href\s*=\s*["\']?(https?://[^"\'>\s]+)!i', $html, $matches)) {
            $this->set_base_url(trim($matches[1]));
        }

        if ($this->rewrite_url) {
            // \b stops "a" matching <abbr>, <article>, ...; [^>]*? stops the
            // attribute search running on into a following, unrelated tag
            $rewritten = preg_replace_callback(
                '/<(a|area|img|link|frame|iframe|input)\b([^>]*?)(src|href)(\s*=\s*)("|\')?([^> \'"]+)/is',
                array($this, 'rewrite_url'),
                $html
            );
            // preg_replace_callback() returns null on a PCRE error; keep the
            // existing html rather than wiping it out
            if ($rewritten !== null) {
                $this->html = $rewritten;
            }
        }
    }

    /**
    * Rewrite URL
    *
    * Callback for preg_replace_callback() in process(). Receives the match for
    * the start of a tag up to the end of its url and returns the replacement.
    *
    * - mailto:, fragment only (#..) and non http(s) urls are returned unchanged
    * - absolute http(s) urls are kept as they are
    * - relative urls are made absolute using the base url (if one is set)
    * - urls of a, area, frame and iframe tags are additionally routed through
    *   the pass through script (see add_passthrough())
    *
    * @param array $modify regex match: [0] whole match, [1] tag name,
    *                      [2] text between tag name and attribute, [3] attribute
    *                      name, [4] "=" with whitespace, [5] opening quote (may be
    *                      empty), [6] url
    * @return string replacement text for $modify[0]
    */
    function rewrite_url($modify)
    {
        $url = trim($modify[6]);
        $tag = strtolower($modify[1]);
        $pre = "<".$modify[1].$modify[2].$modify[3].$modify[4].$modify[5];

        $url_split = parse_url($url);
        if (!is_array($url_split)) {
            // seriously malformed url, return unchanged
            return $modify[0];
        }

        // if mailto link, return unchanged
        if (strncasecmp($url, "mailto:", 7) === 0) {
            return $modify[0];
        }

        // if scheme included
        if (isset($url_split["scheme"])) {
            $scheme = strtolower($url_split["scheme"]);
            if ($scheme === "http" || $scheme === "https") {
                return $pre.$this->_maybe_passthrough($tag, $url);
            }
            // ftp:, javascript:, etc. - return unchanged
            return $modify[0];
        }

        // if fragment only (#??), return unchanged
        if (substr($url, 0, 1) === "#") {
            return $modify[0];
        }

        // scheme not included (relative): nothing to resolve against without a base url
        if (empty($this->base_url)) {
            return $modify[0];
        }

        $base_split  = parse_url($this->base_url);
        $base_scheme = (is_array($base_split) && isset($base_split["scheme"]))
            ? strtolower($base_split["scheme"])
            : "http";

        // protocol relative (//host/path): only the scheme is missing
        if (isset($url_split["host"])) {
            return $pre.$this->_maybe_passthrough($tag, $base_scheme.":".$url);
        }

        if (!isset($url_split["path"])) {
            // eg. query only (?a=b) - return unchanged
            return $modify[0];
        }

        $query    = (isset($url_split["query"]) && $url_split["query"] !== "") ? "?".$url_split["query"] : "";
        $fragment = isset($url_split["fragment"]) ? "#".$url_split["fragment"] : "";

        if (substr($url_split["path"], 0, 1) === "/") {
            // relative to the document root: scheme, host and port come from the base url
            if (!is_array($base_split) || !isset($base_split["host"])) {
                return $modify[0];
            }
            $root = $base_scheme."://".$base_split["host"].(isset($base_split["port"]) ? ":".$base_split["port"] : "");
            $ret  = $root.$url_split["path"].$query;
        } else {
            // relative to the base directory
            $ret = $this->base_url.$url_split["path"].$query;
        }

        // the fragment stays outside the pass through url so the browser still handles it
        return $pre.$this->_maybe_passthrough($tag, $ret).$fragment;
    }

    /**
    * Add passthrough
    *
    * @param string $url absolute url
    * @return string pass through url followed by the url-encoded $url, or $url
    *                unchanged if no pass through url is set
    */
    function add_passthrough($url)
    {
        if (!empty($this->pass_through)) {
            return $this->pass_through.urlencode(trim($url));
        } else {
            return $url;
        }
    }

    /**
    * Add base tag
    *
    * Removes any existing <base> tags and inserts a new one directly after the
    * opening <html> tag (the <html> tag itself, including its attributes, is
    * left as it is). Nothing is inserted if the html has no <html> tag.
    *
    * @param string $url url to use in the base tag
    * @return bool always true
    */
    function add_base($url)
    {
        $html = preg_replace('/<base\b[^>]*>/i', '', (string) $this->html);
        if ($html === null) {
            $html = (string) $this->html;
        }

        // escape the characters which could end the attribute or the tag
        $safe_url = str_replace(array('"', '<', '>'), array('&quot;', '&lt;', '&gt;'), (string) $url);
        $base_tag = '<base href="'.$safe_url.'" />';

        // the tag is spliced in by position, not via a regex replacement string,
        // so "$1" or "\1" inside the url cannot be taken as a backreference
        if (preg_match('/<html\b[^>]*>/i', $html, $matches, PREG_OFFSET_CAPTURE)) {
            $insert_at = $matches[0][1] + strlen($matches[0][0]);
            $html = substr_replace($html, $base_tag, $insert_at, 0);
        }

        $this->html = $html;
        return true;
    }

    /**
    * (internal) Current html as a string, optionally with tags stripped
    */
    function _content($strip_tags)
    {
        $content = (string) $this->html;
        return ($strip_tags ? strip_tags($content) : $content);
    }

    /**
    * (internal) Count occurrences of $needle in $content, 0 for an empty needle
    */
    function _count_substring($content, $needle)
    {
        $needle = (string) $needle;
        if ($needle === "") {
            // substr_count() rejects an empty needle
            return 0;
        }
        return substr_count($content, $needle);
    }

    /**
    * (internal) Count case insensitive whole word occurrences of $word in $content
    */
    function _count_word_matches($content, $word)
    {
        $word = trim((string) $word);
        if ($word === "") {
            return 0;
        }
        // "/" is the pattern delimiter, so it has to be quoted as well
        $count = preg_match_all("/\\b".preg_quote($word, "/")."\\b/i", $content, $matches);
        return ($count ? $count : 0);
    }

    /**
    * (internal) Route $url through the pass through script if $tag is one the
    * user navigates with (a, area, frame, iframe)
    */
    function _maybe_passthrough($tag, $url)
    {
        if (in_array($tag, array("a", "frame", "area", "iframe"), true)) {
            return $this->add_passthrough($url);
        }
        return $url;
    }
}
?> |