Login   Register  
PHP Classes
elePHPant
Icontem

File: simple_crawler_example.php

Recommend this page to a friend!
Stumble It! Stumble It! Bookmark in del.icio.us Bookmark in del.icio.us
  Classes of Jacek Lukasiewicz  >  Simple Page Crawler  >  simple_crawler_example.php  >  Download  
File: simple_crawler_example.php
Role: Example script
Content type: text/plain
Description: example
Class: Simple Page Crawler
Retrieve HTML pages and extract their elements
Author: By
Last change:
Date: 2011-03-11 01:48
Size: 1,382 bytes
 

Contents

Class file image Download
<?php
/**
 * Example usage of the SimpleCrawler class library.
 *
 * Flow of the script:
 *   1. Fetch the start page and count the words of its stripped body.
 *   2. Walk over the links grabbed from the start page and, for every link
 *      whose type equals 1, fetch the linked page and count its words too.
 *   3. Dump the collected word counts with print_r().
 *
 * The classes used here (HtmlReader, HtmlDocument, BodyWords) are expected
 * to be provided by simple_crawler.classes.php.
 */

require 'simple_crawler.classes.php';

$reader = new HtmlReader();

// Start page of the crawl; swap in the commented line to try another site.
$page = 'http://falsztyn.boo.pl';
//$page = 'http://www.phpclasses.org';

// read content from url
$html = $reader->getPageContent($page);

// document content object
$htmlDoc = new HtmlDocument($html);

// document body part object
$body = $htmlDoc->getBody();

// objects array of page links
$links = $body->grabLinks();

// clean text version of document body object
$cleanBody = $body->getStrippedBody();

// counted words from cleaned document body (word => count)
$words = new BodyWords();
$pageWords = $words->findWords($cleanBody->getContent());
$words->appendWords($pageWords);

// follow front page links one level deep (links of the linked pages are not followed)
foreach ($links as $link) {
    // the root link points back at the start page, which was counted above
    if ($link->url == '/') {
        continue;
    }

    // Only links of type 1 are followed; everything else is treated as an
    // external link and skipped. NOTE(review): type 1 presumably marks
    // site-internal links - confirm against simple_crawler.classes.php.
    // To follow external links as well, use $link->url as $pageLink for them.
    if ($link->type != 1) {
        continue;
    }

    // type-1 link urls are appended to the start page address
    $pageLink = $page . $link->url;

    $html = $reader->getPageContent($pageLink);

    $htmlDoc = new HtmlDocument($html);
    $body = $htmlDoc->getBody();
    $cleanBody = $body->getStrippedBody();

    $pageWords = $words->findWords($cleanBody->getContent());

    // second argument passes the link url along with this page's word counts
    $words->appendWords($pageWords, $link->url);
}

// display words:count per page
print_r($words->getWords());
// here you may do something with these words

?>