PHP Classes

File: class.htmlparser.php

Recommend this page to a friend!
  Classes of Lin Zhemin   HTMLparser   class.htmlparser.php   Download  
File: class.htmlparser.php
Role: ???
Content type: text/plain
Description: The class itself
Class: HTMLparser
Author: By
Last change:
Date: 23 years ago
Size: 16,718 bytes
 

Contents

Class file image Download
<? // Copyright(C)MMI, Lin Zhemin. // PHP HTML parser class // Copyright declared according to GPLv2. // // GeOu{AhC // GTp rowspan (vTj) // G|B DTD, , CSS, PHP, ASP $DEBUGLEVEL = 0; // T $SHOWINDENT = 4; // Y // HU http://www.w3.org/TR/html4/index/elements.html // HTML n end tag (bWW End Tag = ) $HTMLbrac = array( 'a', 'abbr', 'acronym', 'address', 'applet', 'b', 'bdo', 'big', 'blockquote', 'button', 'caption', 'center', 'cite', 'code', 'del', 'dfn', 'dir', 'div', 'dl', 'em', 'fieldset', 'font', 'form', 'frameset', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'i', 'iframe', 'ins', 'kbd', 'label', 'legend', 'map', 'menu', 'noframes', 'noscript', 'object', 'ol', 'optgroup', 'pre', 'q', 's', 'samp', 'script', 'select', 'small', 'span', 'strike', 'strong', ' style', 'sub', 'sup', 'table', 'textarea', 'title', 'tt', 'u', 'ul', 'var', 'html', 'body', 'head' // oOvXRI ); // HTML ]tO (WGEnd Tag = O) // @w]tO $HTMLcont = array_merge($HTMLbrac, array( 'colgroup', 'dd', 'dt', 'li', 'option', 'p', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr' )); // HTML L]tO (WGEnd Tag = F, Empty = E) $HTMLnocn = array( 'area', 'base', 'basefont', 'br', 'col', 'frame', 'hr', 'img', 'input', 'isindex', 'link', 'meta', 'param'); // NH $HTMLCloseTag = array( 'p' => array('table') ); function HTMLdebug($l, $s) { global $DEBUGLEVEL; if($l > $DEBUGLEVEL) return; echo "DEBUG: $s\n"; } // Container class Container { var $obj = array(); var $iden; var $cnt = 0; // obj function push(&$o) { HTMLdebug(5, ' Container::push'); $this->cnt++; return array_push($this->obj, $o); } function pop() { return array_pop($this->obj); } function show($in = 0) { HTMLdebug(7, " Container::show($in)"); $r = ""; for($i = 0; $i < $this->cnt; $i++) { $r .= $i.'C'.$this->obj[$i]->show($in); } return $r; } function out() { HTMLdebug(7, ' Container::out'); $r[] = $this; for($i = 0; $i < $this->cnt; $i++) { $r[] = $this->obj[$i]->out(); } return $r; } function type() { return "Container"; } } class HTMLtxt { var $txt; function HTMLtxt(&$s) { HTMLdebug(5, " HTMLtxt::HTMLtxt"); HTMLdebug(4, "HTMLtxt = $s"); $this->txt = trim($s); } function show($in = 0) { HTMLdebug(7, " HTMLtxt::show($in)"); return 'r'.str_repeat(' ', $in).$this->txt."\n"; } function out() { HTMLdebug(7, ' HTMLtxt::out'); return $this->txt; } function iden() { return "HTMLtxt"; } function type() { return "HTMLtxt"; } } // HTML Tag class HTMLattr { var $element; var $property; function HTMLattr($e, $p = "") { HTMLdebug(5, " HTMLattr::HTMLattr"); $this->element = trim($e); $this->property = trim($p); } function show($in = 0) { HTMLdebug(7, " HTMLattr::show($in)"); return ''.str_repeat(' ', $in).$this->element.' = '.$this->property ."\n"; } function out() { HTMLdebug(7, ' HTMLattr::out'); return $this; } function type() { return "HTMLattr"; } function iden() { return "HTMLattr"; } } // HTML Tag class class HTMLtag { var $tag; function HTMLtag($t) { HTMLdebug(5, " HTMLtag::HTMLtag"); if(!preg_match('/(\/?\w+)(.*)/', $t, $a)) { die('_ tag'); } $this->tag = strtolower($a[1]); if(count($a) > 2) { preg_match_all('/([\w-]+)(=\s*"?([^"]*)"?)?/', $a[2], $b); for($i = 0; $i < count($b[0]); $i++) { $attr = new HTMLattr($b[1][$i], $b[3][$i]); $this->push($attr); } } } function push(&$attr) { HTMLdebug(5, ' HTMLtag::push'); $this->cnt++; array_push($this->attr, $attr); } if($this->attr[$i]->element == $a) { return $this->attr[$i]->property; } } return NULL; } function show($in = 0) { global $SHOWINDENT; HTMLdebug(7, " HTMLtag::show($in)"); $r = ''.str_repeat(' ', $in).'['.$this->tag."]\n"; for($i = 0; $i < $this->cnt; $i++) { $r .= $i.'T'.$this->attr[$i]->show($in + $SHOWINDENT); } return $r; } function out() { HTMLdebug(7, ' HTMLtag::out'); $r[] = $this; for($i = 0; $i < $this->cnt; $i++) { $r[] = $this->attr[$i]->out(); } return $r; } function iden() { return $this->tag; } function type() { return "HTMLtag"; } } // HTML para: hAtdh| container // G // $sup Whr // $s nBzry // $t yz // $this->iden er // $this->obj[0] eyz // $this->obj[1..] subnodes // $this->sup OWhr class HTMLpara extends Container { var $iden; var $sup; function HTMLpara($sup, $s, $t = "") { HTMLdebug(5, " HTMLpara::HTMLpara, Wh [$sup]"); $this->sup = $sup; if(is_object($t)) { $this->push($t); } if($s) { if(eregi('^<table', $s)) { $this->fixtable($s); } $this->parse(&$s); } } function iden() { HTMLdebug(5, ' HTMLpara::iden'); return $this->iden; } function push(&$o) { HTMLdebug(5, ' HTMLpara::push'); if(!isset($this->iden)) { $this->iden = $o->iden(); HTMLdebug(3, 'eO ['.$this->iden.']'); } if($o->type() == "HTMLpara") { HTMLdebug(3, 'e ['.$this->iden.'] e ['.$o->iden().']'); } else { HTMLdebug(3, 'e ['.$this->iden.'] ['.$o->iden().']'); } $this->cnt++; array_push($this->obj, $o); } function show($in = 0) { global $SHOWINDENT; HTMLdebug(7, " HTMLpara::show($in)"); $r = 'e'.str_repeat(' ', $in).'{'.$this->iden."}\n"; $r .= Container::show($in + $SHOWINDENT); return $r; } function out() { HTMLdebug(7, ' HTMLpara::out'); $r[] = $this; for($i = 0; $i < $this->cnt; $i++) { $r[] = $this->obj[$i]->out(); } return $r; } function type() { return "HTMLpara"; } function parse($s) { HTMLdebug(5, ' HTMLpara::parse'); while($s) { $this->parsetxt($s); $this->parsetag($s); } HTMLdebug(3, "[".$this->iden."] RF"); } // RGr function parsetxt(&$s) { HTMLdebug(6, ' HTMLpara::parsetxt'); $TxtRe = '/^([^<]*)/'; if(preg_match($TxtRe, $s, $r)) { $txt = trim($r[1]); if(!empty($txt)) { $t = new HTMLtxt($txt); HTMLdebug(2, 'e ['.$this->iden.'] F ['.$t->iden().'] '); $this->push($t); $s = trim(substr($s, strlen($r[1]))); HTMLdebug(8, 's = '.$s); } } } // R: function parsetag(&$s) { global $HTMLcont, $HTMLbrac; HTMLdebug(6, ' HTMLpara::parsetag'); $TagRe = '/^<(\/?\w+[^>]*)>/'; if(preg_match($TagRe, $s, $r)) { $tag = trim($r[1]); $t = new HTMLtag($tag); HTMLdebug(2, 'e ['.$this->iden.'] F ['.$t->iden().'] ') ; // S]tOC if(!in_array($t->tag, $HTMLcont)) { HTMLdebug(4, "$t->tag ]tOA~ parse "); $this->parsetagmono($s, $t, $tag); return true; } // Y $bndb = preg_quote(strtolower($tag), '/'); $bnde = explode(" ", $tag); $bnde = strtolower($bnde[0]); $bnde = preg_quote($bnde, '/'); // Bz if(in_array($t->tag, $HTMLbrac)) { HTMLdebug(4, "n [$tag]"); $ns = $this->parsetagbrac($s, $bndb, $bnde); } else { HTMLdebug(4, " [$tag]"); $ns = $this->parsetagunbrac($s, $tag, $bndb, $bnde); } $c = new HTMLpara($this->iden, trim($ns), $t); // SBz if($t->tag == 'table') { $c->parsetable(); } $this->push($c); } } function parsetagmono(&$s, &$t, &$tag) { HTMLdebug(6, ' HTMLpara::parsetagmono'); $s = trim(substr($s, strlen($tag) + 2)); $this->push($t); HTMLdebug(8, 's = '.$s); } function parsetagbrac(&$s, &$bndb, &$bnde) { HTMLdebug(6, ' HTMLpara::parsetagbrac'); $len = $this->parsebalance($s, $bnde); $p = trim(substr($s, 0, $len)); $s = trim(substr($s, $len)); $re = '/(<'.$bndb.'>(.*)(<\/'.$bnde.'[^>]*>))/i'; // v if(preg_match($re, $p, $q)) { HTMLdebug(4, 'e: '.$q[2]); HTMLdebug(8, 's = '.$s); $ns = &$q[2]; } else { HTMLdebug(0, '~Gregex = '.$re); HTMLdebug(0, '~Gp = '.$p); HTMLdebug(0, '~Gs = '.$s); die('io '.__line__); } return $ns; } function parsetagunbrac(&$s, &$tag, &$bndb, &$bnde) { global $HTMLbrac, $HTMLCloseTag; HTMLdebug(6, ' HTMLpara::parsetagunbrac'); HTMLdebug(5, 'J '.$s); HTMLdebug(5, 'tag = '.$tag); $p = substr($s, strlen($tag) + 2); $skip = 0; while($p) { // Dr if(preg_match('/^([^<]+)/i', $p, $q)) { HTMLdebug(7, '[^<]+ = '.$q[1]); $k = strlen($q[1]); $skip += $k; $p = substr($p, $k); } // if(preg_match('/^(<\/?([\w]+)[^>]*>[^<]*)/i', $p, $q)) { $CloseTag = &$q[2]; HTMLdebug(7, 'q[2] = '.$CloseTag); if($CloseTag == $bnde) { break; } $ct = $HTMLCloseTag["$bnde"]; if(is_array($ct) && in_array($CloseTag, $ct)) { break; } } // L_ if(in_array($q[2], $HTMLbrac)) { $len = $this->parsebalance($p, $q[2]); $skip += $len; $p = substr($p, $len); continue; } $k= strlen($q[1]); $p = substr($p, $k); $skip += $k; } $re = '/(<'.$bndb.'>(.{'.$skip.'})(<\/?'.$bnde.'[^>]*>)?)/i'; HTMLdebug(5, 'regex = '.$re); // v if(preg_match($re, $s, $p)) { HTMLdebug(4, "e: $p[2]"); if($p[3] == '</'.$bnde.'>') { // pGF $s = trim(substr($s, strlen($p[1]))); } else { $s = trim(substr($s, strlen($p[1]) - strlen($p[3]))); } HTMLdebug(8, 's = '.$s); $ns = &$p[2]; } else { die('io, line: '.__line__); } return $ns; } // X function parsebalance($s, $bnde) { HTMLdebug(6, ' HTMLpara::parsebalance'); HTMLdebug(5, 'J '.$s.' / '.$bnde); $tmp = $s; $balance = 0; $len = 0; $re = '/^(<\/?(\w+)[^>]*>([^<]*))/i'; HTMLdebug(6, 'regex: '.$re); do { if(!preg_match($re, $tmp, $p)) { HTMLdebug(6, ' regex F'); break; } if(strtolower($p[2]) != $bnde) { $sl = strlen($p[1]); $len += $sl; $tmp = trim(substr($tmp, $sl)); continue; } if($p[1][1] == '/') { $balance--; } else { $balance++; } $sl = strlen($p[1]); $len += $sl; $tmp = trim(substr($tmp, strlen($p[1]))); HTMLdebug(6, 'balance = '.$balance.' len = '.$len.' sl = '.$sl); } while($balance != 0); if($balance) { HTMLdebug(0, 'R~: '.$s); die('Line '.__line__); } $len -= strlen($p[3]); return $len; } // ApC function parsetable() { // `NGSBz rowspan p HTMLdebug(6, ' HTMLpara::parsetable'); $td = array(); $tr = 0; for($i = 1; $i < $this->cnt; $i++) { $o = &$this->obj[$i]; if($o->iden() == 'tr') { $tr++; $ttd = 0; for($j = 1; $j < $o->cnt; $j++) { $p = &$o->obj[$j]; if($p->iden() == 'td' || $p->iden() == 'th') { if(($csp = $p->obj[0]->getattr('colspan')) != NULL) { $ttd += $csp; } else { $ttd++; } } } $td[] = $ttd; } } $max = 0; for($i = 0; $i < count($td); $i++) { if($max < $td[$i]) { HTMLdebug(4, "td[".$i."] = ".$td[$i]); $max = $td[$i]; } } $td = $max; HTMLdebug(3, ' '.$tr.' CA'.$td.''); // oO table I $this->obj[0]->push(new HTMLattr('rows', $tr)); $this->obj[0]->push(new HTMLattr('cols', $td)); } // table function fixtable(&$s) { HTMLdebug(6, ' HTMLpara::fixtable'); HTMLdebug(8, 'JF '.$s); $re = '/^(<(\/?\w+)[^>]*>[^<]*)/i'; $tmp = $s; $tags = array(); $len = 0; while(preg_match($re, $tmp, $p)) { if($p[2] == 'td' || $p[2] == 'th') { break; } $tags[] = $p[2]; $sl = strlen($p[1]); $len += $sl; $tmp = substr($tmp, $sl); } // [J@ <tr> $r = substr($s, 0, $len); if(!in_array('tr', $tags)) { HTMLdebug(6, '[JF <tr> '); $r .= '<tr>'; } $oldlen = $len; while(preg_match($re, $tmp, $p)) { $ot = $tags; $ol = $len; $tags = $p[2]; $len = strlen($p[1]); $tmp = substr($tmp, $len); } // h@ <tr> if($ot == 'tr') { HTMLdebug(5, 'F <tr> '); $r .= substr($s, $oldlen, strlen($s) - $oldlen - $len - $ol) . substr($s, strlen($s) - $len); } else { $r .= substr($s, $oldlen); } $s = trim($r); } } // HTML J class class HTMLin extends Container { function HTMLin($s) { HTMLdebug(5, " HTMLin::HTMLin()"); $s = strtr($s, "\n", ' '); $s = strtr($s, "\r", ''); $s = preg_replace('/<!.*>/U', '', $s); // Bz DTD//CSS $s = preg_replace('/<[?%].*[?%]>/U', '', $s); // Bz PHP/ASP $s = preg_replace('/>\s+</U', '><', $s); // d $s = strchr($s, '<'); // Bz@eF $this->iden = "HTMLin"; $t = new HTMLtag("HTMLin"); $this->push(new HTMLpara($this->iden, $s, $t)); } function show($in = 0) { HTMLdebug(7, " HTMLin::show($in)"); $r = Container::show($in); return $r; } function out() { HTMLdebug(7, ' HTMLin::out'); for($i = 0; $i < $this->cnt; $i++) { $r[] = $this->obj[$i]->out(); } return $r; } function type() { return "HTMLin"; } function iden() { return "HTMLin"; } } ?> function getattr($a) { HTMLdebug(5, ' HTMLtag::getattr'); for($i = 0; $i < $this->cnt; $i++) { var $attr = array(); var $cnt = 0; // attr