PHP Classes

File: scraper.php

Recommend this page to a friend!
  Classes of Jacek Lukasiewicz   Web scraper   scraper.php   Download  
File: scraper.php
Role: Class source
Content type: text/plain
Description: scraper class
Class: Web scraper
Extract information from Web site pages
Author: By
Last change:
Date: 12 years ago
Size: 5,132 bytes
 

Contents

Class file image Download
<?php
include 'lib/phpQuery-onefile.php';
/**
 *
 * This class allows you to get data from any site.
 * The data are taken from defined locations in the DOM structure.
 * Data points are defined using the phpquery notation - similar to the selectors used in JQuery library.
 * This class can fetch data in three different modes by:
 * - scanning a single page
 * - scanning a "from->to" range of pages matching defined URL schema
 * - scanning a list of URLs retrieved from a PHP array
 *
 * @example scrap single page
 * $scrap = new Scraper();
 * $scrap->setBaseUrl('http://page.to.scrap/index.html');
 * $scrap->addDataTarget('title', '#product h1');
 * $data = $scrap->process();
 *
 * @example scrap range of pages
 * $scrap = new Scraper();
 * $scrap->setBaseUrl('http://example.url.com/details.html?id=##TOKEN##');
 * $scrap->addRangeScanRule(151598039, 151598042, '##TOKEN##');
 * $scrap->addDataTarget('name', '#head1 .title h1');
 * $data = $scrap->process();
 *
 * @example scrap list of custom urls
 * $scrap = new Scraper();
 * $myUrls = array('http://site.ccm/ulr1/', 'http://site.ccm/ulr2/', 'http://site.ccm/ulr3/');
 * $scrap->addListScanRule($myUrls);
 * $scrap->addDataTarget('title', '#content .ogloszenie_item h1');
 * $scrap->addDataTarget('image', '#content .ogloszenie_thumb a');
 * $scrap->addDataTarget('price', '#content .ogloszenie_item:contains(\'Cena:\')');
 * $data = $scrap->process();
 *
 * Downloaded data is returned as an array (one entry per successfully scanned page).
 * You can do whatever you want with the data;)
 *
 * @package Scraper
 * @see This class uses phpquery library
 * @link http://code.google.com/p/phpquery/
 *
 * @author JLukasiewicz jlukasie at gmail
 *
 */
class Scraper
{
    /**
     * URL used for single-page scans, or URL template (containing the
     * range token) used for range scans.
     *
     * @var string
     */
    private $baseUrl = '';

    /**
     * Configured scan rules, keyed by rule type ('range' or 'list').
     * When both are present the 'range' rule takes precedence.
     *
     * @var array
     */
    private $scanRule = array();

    /**
     * Data points to extract: result key => phpQuery selector.
     *
     * @var array
     */
    private $dataTargets = array();

    public function __construct()
    {
    }

    /**
     * Set the base URL (single page) or the URL template (range scan).
     *
     * @param string $url
     * @throws Exception when $url is empty
     */
    public function setBaseUrl($url)
    {
        if (empty($url)) {
            throw new Exception('Value not specified: url', 1);
        }

        $this->baseUrl = $url;
    }

    /**
     * Store a scan rule under its type key.
     *
     * @param string $type  rule type, 'range' or 'list'
     * @param mixed  $value rule definition
     * @throws Exception when $type or $value is empty
     */
    private function setScanRule($type, $value)
    {
        if (empty($type) || empty($value)) {
            throw new Exception('Value not specified: type or value', 1);
        }

        $this->scanRule[$type] = $value;
    }

    /**
     * Configure a range scan: every integer from $min to $max (inclusive)
     * is substituted for $token in the base URL.
     *
     * @param int    $min   first value of the range (0 is allowed)
     * @param int    $max   last value of the range
     * @param string $token placeholder in the base URL to be replaced
     * @throws Exception when a bound is not numeric or the token is empty
     */
    public function addRangeScanRule($min, $max, $token)
    {
        // is_numeric() instead of empty(): 0 is a legitimate range bound.
        if (!is_numeric($min) || !is_numeric($max)) {
            throw new Exception('Value not specified: min or max', 1);
        }

        // An empty token would make str_replace() a no-op and the same
        // base URL would be scanned once per range value.
        if (!is_scalar($token) || (string) $token === '') {
            throw new Exception('Value not specified: token', 1);
        }

        $this->setScanRule(
            'range',
            array('min' => (int) $min, 'max' => (int) $max, 'token' => $token)
        );
    }

    /**
     * Configure a list scan over an explicit set of URLs.
     *
     * @param array $list URLs to scan
     * @throws Exception when $list is empty or not an array
     */
    public function addListScanRule($list)
    {
        if (empty($list) || !is_array($list)) {
            throw new Exception('address list is not specified', 1);
        }

        $this->setScanRule('list', $list);
    }

    /**
     * Register a data point to extract from every scanned page.
     *
     * @param string $name     key under which the value is returned
     * @param string $selector phpQuery selector locating the value
     * @throws Exception when $name or $selector is empty
     */
    public function addDataTarget($name, $selector)
    {
        if (empty($name) || empty($selector)) {
            throw new Exception('Value not specified: name or selector', 1);
        }

        $this->dataTargets[$name] = $selector;
    }

    /**
     * Run the scan.
     *
     * Each URL is downloaded once; pages that cannot be fetched (or are
     * empty) are skipped. For every fetched page the inner HTML of each
     * configured data target is collected.
     *
     * @return array list of arrays (data target name => extracted HTML),
     *               one per page that yielded data
     * @throws Exception when the scan configuration is incomplete
     */
    public function process()
    {
        $data = array();

        foreach ($this->getUrlsToScan() as $url) {
            // Best-effort fetch: unreachable pages are skipped, not fatal.
            $input = @file_get_contents($url);
            if (!$input) {
                continue;
            }

            // Parse the markup already downloaded instead of letting
            // phpQuery fetch the same URL a second time.
            phpQuery::newDocumentHTML($input);

            $scrap = array();
            foreach ($this->dataTargets as $name => $selector) {
                $scrap[$name] = pq($selector)->html();
            }

            if (!empty($scrap)) {
                $data[] = $scrap;
            }
        }

        return $data;
    }

    /**
     * Build the list of URLs to scan from the configured rules.
     *
     * Precedence: 'range' rule, then 'list' rule, then the bare base URL.
     *
     * @return array URLs to scan
     * @throws Exception code 2 when a base URL is required but missing,
     *                   code 3 when the range rule is malformed
     */
    private function getUrlsToScan()
    {
        if (empty($this->scanRule)) {
            // Single-page mode. An empty path would make
            // file_get_contents() raise a ValueError on PHP 8.
            if (empty($this->baseUrl)) {
                throw new Exception('baseUrl not specified', 2);
            }

            return array($this->baseUrl);
        }

        if (!empty($this->scanRule['range'])) {
            if (empty($this->baseUrl)) {
                throw new Exception('baseUrl not specified', 2);
            }

            $range = $this->scanRule['range'];
            if (!isset($range['min'], $range['max'], $range['token'])
                || !is_numeric($range['min'])
                || !is_numeric($range['max'])
            ) {
                throw new Exception('scanRule invalid format', 3);
            }

            $urls = array();
            $last = (int) $range['max'];
            for ($i = (int) $range['min']; $i <= $last; $i++) {
                $urls[] = str_replace($range['token'], (string) $i, $this->baseUrl);
            }

            return $urls;
        }

        if (!empty($this->scanRule['list'])) {
            return $this->scanRule['list'];
        }

        return array();
    }
}