<?php
/**
* Class for getting general informations about html content
* @author Sven Wagener <wagener_at_indot_dot_de>
* @include Funktion:_include_
*/
class html_info{
var $string="";
var $meta="";
/**
* Constructor of class html_info
* @param string $html_string The whole HTML document as String
* @desc Constructor of class html_info
*/
function html_info($html_string){
$this->string=$html_string;
}
/**
* Returns the title
* @return string $title the title of the HTML document
* @desc Constructor of class html_info
*/
function get_title(){
$string=strtolower($this->string);
preg_match_all("|<title>(.*)</title>|U",$string,$matches, PREG_PATTERN_ORDER);
return $matches[1][0];
}
/**
* Returns the meta data
* @return array $matches the title of the HTML document
* @desc Returns the meta data of the HTML document in an array ($matches[$i]['name'] and $matches[$i]['content'])
*/
function get_meta_data(){
$string=strtolower($this->string);
preg_match_all("|<meta (.*)>|U",$string,$matches, PREG_PATTERN_ORDER);
$k=0;
$tmp_match_array="";
// Putting all matches in an array
for($i=0;$i<count($matches);$i++){
for($j=0;$j<count($matches[$i]);$j++){
if($matches[$i][$j]!=""){
$tmp_match_array[$k]=$matches[$i][$j];
$k++;
}
}
}
$matches="";
// Getting detailed information of meta data and putting in array
$k=0;
for($i=0;$i<count($tmp_match_array);$i++){
// Getting name
preg_match_all("|name\=\"(.*)\" |U",$tmp_match_array[$i],$name_matches, PREG_PATTERN_ORDER);
// Checking if entry not exists
$found=false;
for($j=0;$j<count($matches);$j++){
if($matches[$j]['name']==$name_matches[1][0]){
$found=true;
}
}
if(!$found && $name_matches[1][0]!=""){
$matches[$k]['name']=$name_matches[1][0];
// Getting content
preg_match_all("|content\=\"(.*)\"|U",$tmp_match_array[$i],$content_matches, PREG_PATTERN_ORDER);
$matches[$k]['content']=$content_matches[1][0];
$k++;
}
}
$this->meta=$matches;
return $matches;
}
/**
* Returns all images
* @return array $match the pictures and all information in an array
* @desc Returns all images in an array ($match[$i]['src'], $match[$i]['alt'], $match[$i]['width'] and $match[$i]['height'])
*/
function get_images(){
$string=strtolower($this->string);
preg_match_all("|<img (.*)>|U",$string,$matches, PREG_PATTERN_ORDER);
// Putting all matches in an array
for($i=0;$i<count($matches);$i++){
for($j=0;$j<count($matches[$i]);$j++){
if($matches[$i][$j]!=""){
$tmp_match_array[$k]=$matches[$i][$j];
$k++;
}
}
}
$k=0;
for($i=0;$i<count($tmp_match_array);$i++){
$found=false;
for($j=0;$j<count($match);$j++){
if($this->get_tag_param("src",$tmp_match_array[$i])==$match[$j]['src']){
$found=true;
}
}
if(!$found && $this->get_tag_param("src",$tmp_match_array[$i])!=""){
$match[$k]['src']=$this->get_tag_param("src",$tmp_match_array[$i]);
$match[$k]['alt']=$this->get_tag_param("alt",$tmp_match_array[$i]);
$match[$k]['width']=$this->get_tag_param("width",$tmp_match_array[$i]);
$match[$k]['height']=$this->get_tag_param("height",$tmp_match_array[$i]);
$k++;
}
}
return $match;
}
/**
* Returns all links
* @return array $match the links and all information in an array
* @desc Returns all links in an array ($match[$i]['href'] and $match[$i]['target'])
*/
function get_links(){
$string=strtolower($this->string);
preg_match_all("|<a (.*)>|U",$string,$matches, PREG_PATTERN_ORDER);
// Putting all matches in an array
for($i=0;$i<count($matches);$i++){
for($j=0;$j<count($matches[$i]);$j++){
if($matches[$i][$j]!=""){
$tmp_match_array[$k]=$matches[$i][$j];
// echo $tmp_match_array[$k]."<br>\n";
$k++;
}
}
}
$k=0;
for($i=0;$i<count($tmp_match_array);$i++){
$found=false;
for($j=0;$j<count($match);$j++){
if($this->get_tag_param("href",$tmp_match_array[$i])==$match[$j]['href']){
$found=true;
}
}
if(!$found && $this->get_tag_param("href",$tmp_match_array[$i])!=""){
$match[$k]['href']=$this->get_tag_param("href",$tmp_match_array[$i]);
$match[$k]['target']=$this->get_tag_param("target",$tmp_match_array[$i]);
$k++;
}
}
return $match;
}
/**
* Returns all strings which are formated like the given parameter
* @param boolean $bold if string have to be formatted bold choose true
* @param boolean $italic if string have to be formatted italic choose true
* @param boolean $underlined if string have to be formatted underlined choose true
* @return array $strings the strings which have been found in an array
* @desc Returns all strings in an array which are formated like the given parameter
*/
function get_strings_formated($bold,$italic,$underlined){
$i=0;
if($bold){
$tags[$i]['open']="<b>";
$tags[$i]['close']="</b>";
$i++;
}
if($italic){
$tags[$i]['open']="<i>";
$tags[$i]['close']="</i>";
$i++;
}
if($underlined){
$tags[$i]['open']="<u>";
$tags[$i]['close']="</u>";
$i++;
}
$strings=$this->get_strings_in_tags($tags,$this->string);
return $strings;
}
/**
* Returns all strings in $string which are given to the parameter $tags
* @param array $tags the tags in an array ($tags[$i]['open'] and $tags[$i]['close'])
* @param string $string the HTML string
* @return array $strings the strings which have been found in an array
* @desc Returns all strings in $string which are given to the parameter $tags
*/
function get_strings_in_tags($tags,$string){
for($i=0;$i<count($tags);$i++){
$k=0;
$pattern="|".$tags[$i]['open']."(.*)".$tags[$i]['close']."|U";
preg_match_all($pattern,$string,$matches, PREG_PATTERN_ORDER);
// Getting rest of all Tags
for($j=0;$j<count($tags);$j++){
if($tags[$j]['open']!=$tags[$i]['open'] && $tags[$j]['close']!=$tags[$i]['close']){
$new_tags[$k]=$tags[$j];
$k++;
}
}
// Getting Strings from all matches
for($j=0;$j<count($matches[1]);$j++){
$new_string=$matches[1][$j];
}
if(count($tags)==1){
for($j=0;$j<count($matches[1]);$j++){
$end_matches[$j]=strip_tags($matches[1][$j]);
}
return $end_matches;
}else{
for($j=0;$j<count($matches[1]);$j++){
$new_string=$matches[1][$j];
$end_matches=array_merge($this->get_strings_in_tags($new_tags,$new_string),$end_matches);
}
}
}
return $end_matches;
}
/**
* Returns all strings in $string which are between the start and end tag
* @param string $start_tag the starting tag
* @param string $end_tag the end tag
* @param string $string the string to search for
* @return array $strings the strings which have been found pusched in an array
* @desc Returns all strings in $string which are between the start and end tag
*/
function get_strings_in_tag($start_tag,$end_tag,$string){
$pattern="|".$start_tag."(.*)".$end_tag."|U";
preg_match_all($pattern,$string,$matches, PREG_PATTERN_ORDER);
for($j=0;$j<count($matches[1]);$j++){
$array[$j]=$matches[1][$j];
}
return $array;
}
/**
* Returns all strings which are headed (<h1> ... </h1> etc)
* @param int $from_headnumber
* @param int $till_headnumber
* @return array $strings the strings which have been found pusched in an array
* @desc Returns all strings which are headed (<h1> ... </h1> etc)
*/
function get_strings_headed($from_headnumber,$till_headnumber){
$count_headers=$till_headnumber-$from_headnumber;
$result_arr=array();
for($i=$from_headnumber;$i<=$till_headnumber;$i++){
$results=$this->get_strings_in_tag("<h$i>","</h$i>",$this->string);
if($results!=""){
$result_arr=array_merge($result_arr,$results);
}
}
return $result_arr;
}
/**
* Returns the content of the body
* @return string $bodytext The content of the body
* @desc Returns the content of the body
*/
function get_body(){
// Getting body parametres
$pattern="|<body(.*)>|U";
preg_match_all($pattern,$string,$matches, PREG_PATTERN_ORDER);
// Deleting body parameters
$string=str_replace($matches[1][0],"",$string);
echo "<xmp>".$string."</xmp>";
$pattern="|<body>(.*)</body>|U";
// Getting text in body
$matches="";
preg_match_all($pattern,$string,$matches, PREG_SET_ORDER);
$string=$matches;
for($i=0;$i<count($string);$i++){
for($j=0;$j<count($string[$i]);$j++){
echo "\$string[$i][$j]".$string[$i][$j]."<br>";
}
}
}
/**
* Returns the content of the body without tags
* @return string $bodytext the content of the body without tags
* @desc Returns the content of the body without tags
*/
function get_body_text(){
$string=$this->string;
$string=strip_tags($string);
$string=str_replace("\n","",$string);
$string=str_replace("\r","",$string);
$string=str_replace("\t","",$string);
$string=str_replace("<!--","",$string);
$string=str_replace("//-->","",$string);
$string=str_replace(" ","",$string);
return $string;
}
/**
* Returns the frame urls
* @return array $frame_urls the urls of the frame in an array
* @desc Returns the frame urls
*/
function get_frame_urls(){
}
function get_tag_param($param,$tag){
preg_match_all("|$param\=\"(.*)\"|U",$tag,$matches, PREG_PATTERN_ORDER);
if($matches[1][0]==""){
preg_match_all("|$param\=(.*)|U",$tag,$matches, PREG_PATTERN_ORDER);
}
if($matches[1][0]==""){
preg_match_all("|$param\=\'(.*)\'|U",$tag,$matches, PREG_PATTERN_ORDER);
}
return $matches[1][0];
}
}
?>
|