PHP Classes

File: upperclass_spans.php

Recommend this page to a friend!
  Classes of Jill Lingoff   Sweeper   upperclass_spans.php   Download  
File: upperclass_spans.php
Role: Auxiliary script
Content type: text/plain
Description: Auxiliary script
Class: Sweeper
Clean HTML to remove unwanted tags and attributes
Author: By
Last change:
Date: 5 years ago
Size: 3,071 bytes
 

Contents

Class file image Download
<?php

// Re-cases the text inside every <span style="text-transform:uppercase"> element of one
// HTML file so that the markup stores capitalised words instead of all-caps text, then
// writes the file back in place. A before/after table is printed for manual review.
//
// Should be good enough for names in French bibliographies, although it does not handle
// stop words.
// ex. AUSTRALIAN GOVERNMENT DEPARTMENT OF HEALTH AND AGEING => Australian Government Department Of Health And Ageing

/**
 * Capitalises every word of a piece of span text, one byte at a time.
 *
 * Rules applied:
 *  - the first ASCII letter of a word is upper-cased, the remaining ones lower-cased;
 *  - any character that is not an ASCII letter ends the current word;
 *  - a word starting with "Mc" also gets its third character upper-cased (MCDONALD => McDonald);
 *  - a character entity ("&" up to and including the next ";") is copied in lower case
 *    and does not start or end a word.
 *
 * Known limitations: an entity at the very start of a word is lower-cased and the letter
 * after it is capitalised instead (&Eacute;COLE => &eacute;Cole), and a bare "&" that is
 * never closed by ";" lower-cases everything that follows it.
 *
 * @param string $span_content Text found between the span tags.
 *
 * @return string The re-cased text.
 */
function capitalize_span_words($span_content)
{
    $in_word = false;
    $in_character_entity = false;
    $expect_mac_c = false;         // previous character was a word-initial "M"
    $capitalize_after_mac = false; // previous two characters were a word-initial "Mc"
    $result = '';
    $length = strlen($span_content);

    for ($position = 0; $position < $length; $position++) {
        $char = $span_content[$position];

        if ($in_character_entity) {
            if ($char === ';') {
                $in_character_entity = false;
            }
            // Entities are deliberately lower-cased as well (&Eacute; => &eacute;).
            $result .= strtolower($char);
            continue;
        }

        if ($char === '&') {
            $in_character_entity = true;
            // Drop any pending "Mc" state: otherwise "M&Eacute;CANIQUE" would be read as
            // "M" directly followed by "C" and come out as "M&eacute;cAnique".
            $expect_mac_c = false;
            $capitalize_after_mac = false;
            $result .= $char;
            continue;
        }

        $is_letter = preg_match('/[A-Z]/i', $char) === 1;

        if (!$in_word) {
            if ($is_letter) {
                $in_word = true;
                $expect_mac_c = ($char === 'M' || $char === 'm');
                $result .= strtoupper($char);
            } else {
                $result .= strtolower($char);
            }
            continue;
        }

        // Inside a word: a non-letter always closes it, including right after "Mc",
        // so that "MC DONALD" yields "Mc Donald" rather than "Mc donald".
        if (!$is_letter) {
            $in_word = false;
        }

        if ($capitalize_after_mac) {
            $result .= strtoupper($char);
            $capitalize_after_mac = false;
        } else {
            $capitalize_after_mac = $expect_mac_c && ($char === 'C' || $char === 'c');
            $result .= strtolower($char);
        }
        $expect_mac_c = false;
    }

    return $result;
}

$file = 'C:\wamp\www\sweeper\not-swept\im-014-fr.html';
$contents = is_readable($file) ? file_get_contents($file) : false;

if ($contents === false) {
    // Stop here: writing back after a failed read would replace the file with nothing.
    print('<p>Unable to read ' . htmlspecialchars($file) . '</p>');
} elseif (preg_match_all('/<span style="text-transform:uppercase">(.*?)<\/span>/is', $contents, $upperclass_span_matches, PREG_OFFSET_CAPTURE) === false) {
    print('<p>Unable to search ' . htmlspecialchars($file) . ' for uppercase spans</p>');
} else {
    print(
'<table>');
    // Walk the matches from last to first so the byte offsets of the earlier matches
    // remain valid while $contents is being rewritten.
    for ($counter = count($upperclass_span_matches[0]) - 1; $counter > -1; $counter--) {
        $matched_span = $upperclass_span_matches[0][$counter][0];
        $span_offset = $upperclass_span_matches[0][$counter][1];
        $span_content = $upperclass_span_matches[1][$counter][0];
        $new_span_content = capitalize_span_words($span_content);

        print(
'<tr>
<th align="left">'
. $span_content . '</th>
<td>'
        );
        print(
$new_span_content . '</td>
</tr>
'
        );

        $contents = substr_replace(
            $contents,
            '<span style="text-transform:uppercase">' . $new_span_content . '</span>',
            $span_offset,
            strlen($matched_span)
        );
    }
    print(
'</table>');

    if (file_put_contents($file, $contents) === false) {
        print('<p>Unable to write ' . htmlspecialchars($file) . '</p>');
    }
}

?>