<?php
// Example script: download a page, parse it and collect its tags so that
// keyword-density statistics can be computed from them.
// The full open tag is used because the short "<?" form only works when the
// short_open_tag ini setting is enabled.
require 'class.Html.php';

$url = 'http://www.yahoo.com/';
$objHtmlParser = new Html($url);
$objHtmlParser->Clean();

// Here the html page is parsed into a tree structure.
// The tree is saved in the $objHtmlParser->tree variable,
// but this tree is still not directly useful.
//
// NOTE: call-time pass-by-reference (writing &$var at the call site) was
// removed in PHP 5.4 and is a fatal error there, so the variables are passed
// plainly. For $description / $all_tags to be filled in, Html::Parse() and
// Html::FindAllTags() must declare these parameters by reference in their own
// signatures -- TODO confirm in class.Html.php.
$description = '';
$objHtmlParser->Parse($description);

// Taking all needed tags from the tree gives us a more useful list of tags.
$all_tags = array();
$objHtmlParser->FindAllTags($objHtmlParser->tree, $all_tags);

// Collect phrases with 1, 2, 3 or 4 keywords;
// add more lengths if you need to analyze longer phrases.
$stat_prepare = array(1, 2, 3, 4);

// Here is how the $all_tags array can be used now: read the page charset.
// The lookup is guarded because a page may have no content-type meta tag;
// in that case $charset is null, exactly as the unguarded lookup produced,
// but without raising undefined-index warnings.
$charset = isset($all_tags['meta content-type'][0]['props']['charset'])
    ? $all_tags['meta content-type'][0]['props']['charset']
    : null;
//
/**
 * KeywordsDensity() is the keyword density function.
 * It returns a sorted array with all keyword phrases.
 * The returned array has the following structure:
 * Array(
 *   [1] => Array(                  - phrases with 1 keyword
 *     [yahoo] => Array(            - tags where the phrase was found
 *       [title] => Array(
 *         [count] => 1             - how many times the phrase repeats in this tag
 *         [text] => Yahoo!
 *       )
 *       [__total__] => 20,         - total repeats in the whole page
 *       .................................
 *     )
 *     .........................
 *   ),
 *   [2] => Array(                  - phrases with 2 keywords
 *     [real estate] => Array(      - tags where the phrase was found
 *       [a] => Array(
 *         [count] => 2
 *         [text] => Real Estate; Yahoo! Real Estate -
 *       )
 *       [__total__] => 2,
 *       ............................
 *     ),
 *     ........................
 *   )
 * )
 */
// Compute the phrase statistics for every phrase length in $stat_prepare.
$words_stat = $objHtmlParser->KeywordsDensity($all_tags, $stat_prepare);

// $words_stat still contains phrases we do not need.
// Drop the rarely repeated ones; the threshold 2 presumably means
// "repeated fewer than 2 times" -- confirm in Html::deleteMinEntries().
$words_top_notsorted = $objHtmlParser->deleteMinEntries($words_stat, 2);

// Sort the phrases of each group by their __total__ (total repeats in page)
// value. The group's own __total__ entry is saved before sorting and put back
// afterwards. It is read through isset() so a group without that entry does
// not raise an undefined-index warning; the restored value is null in that
// case, same as before.
foreach ($words_stat as $key => $phrases) {
    $total = isset($phrases['__total__']) ? $phrases['__total__'] : null;
    $words_stat[$key] = $objHtmlParser->SortWordsSataistic($phrases, 2);
    $words_stat[$key]['__total__'] = $total;
}

$charset_to = "UTF-8";

// Here we have the sorted words (printed before any encoding conversion).
print_r($words_stat);

// Convert the encoding in case Chinese or any other non-ASCII characters
// were used on the page.
$words_stat = $objHtmlParser->ConvertEncoding($words_stat, $charset, $charset_to);
$words_top_notsorted = $objHtmlParser->ConvertEncoding($words_top_notsorted, $charset, $charset_to);

// Example of how to take other data from the html.
// title / meta keywords / meta description are optional on a page, so each
// lookup is guarded; a missing tag yields null (the value the unguarded
// lookup evaluated to) without raising undefined-index warnings.
$title_text = isset($all_tags['title'][0]['text'])
    ? $all_tags['title'][0]['text']
    : null;
$keywords_text = isset($all_tags['meta keywords'][0]['text'])
    ? $all_tags['meta keywords'][0]['text']
    : null;
$desc_text = isset($all_tags['meta description'][0]['text'])
    ? $all_tags['meta description'][0]['text']
    : null;

$title = $objHtmlParser->helpConvertEncoding($title_text, $charset, $charset_to);
$keywords = $objHtmlParser->helpConvertEncoding($keywords_text, $charset, $charset_to);
$desc = $objHtmlParser->helpConvertEncoding($desc_text, $charset, $charset_to);
$original_text = $objHtmlParser->helpConvertEncoding($objHtmlParser->original_text, $charset, $charset_to);
// The closing tag is kept on purpose: literal text follows it in this file.
?>
|