File: abbr/getOrgAcro.php

Recommend this page to a friend!
abbr/getOrgAcro.php
File:	`abbr/getOrgAcro.php`
Role:	Auxiliary script
Content type:	`text/plain`
Description:	Auxiliary script
Class:	Sweeper Clean HTML to remove unwanted tags and attributes
Author:	By Jill Lingoff
Last change:
Date:	6 years ago
Size:	`5,768 bytes`
Download

<?php



// This script was used to collect Canadian government organization names and their acronyms,

// as well as contact information (looked up with those acronyms) from the Government Electronic Directory Services.



// Download the English organization listing and write one
// "<acronym>    <name>" line (CRLF-terminated) per organization to eng/GOC/acronyms.txt.
$ToWrite = "";
$OrganizationArray = array();
$OrganizationNameArray = array();
$file = "eng/GOC/acronyms.txt";

$OrganizationListPage = file_get_contents("http://direct.srv.gc.ca/cgi-bin/direct500/");

if (false === $OrganizationListPage) {
    // file_get_contents() returns false on failure; report it and leave any
    // previously generated output file untouched instead of truncating it.
    trigger_error("Could not download the English organization list", E_USER_WARNING);
} else {
    $OrganizationArray = findOrgsInURLs($OrganizationListPage);
    $OrganizationNameArray = findOrgNamesInURLs($OrganizationListPage);

    foreach ($OrganizationArray as $index => $acro) {
        // Keep the text in front of the first hyphen (presumably the English half
        // of an "ENG-FRA" pair). A value without a hyphen yields an empty acronym.
        $match = preg_match("/([^\-]*)-/", $acro, $matches) ? $matches[1] : "";

        // NOTE(review): acronyms and names come from two separate patterns, so the
        // two lists only line up if every matching link is matched by both.
        $name = isset($OrganizationNameArray[$index]) ? $OrganizationNameArray[$index] : "";

        $ToWrite .= $match . "    " . $name . "\r\n";
    }

    $fp = fopen($file, 'w');
    if (false === $fp) {
        // fopen() already raised a warning; do not pass a non-resource to fwrite().
        trigger_error("Could not open " . $file . " for writing", E_USER_WARNING);
    } else {
        fwrite($fp, $ToWrite);
        fclose($fp);
    }
}



// Download the French organization listing and write one
// "<acronym>    <name>" line (CRLF-terminated) per organization to fra/GDC/acronyms.txt.
$FraToWrite = "";
$FraOrganizationArray = array();
$FraOrganizationNameArray = array();
$Frafile = "fra/GDC/acronyms.txt";

$FraOrganizationListPage = file_get_contents("http://direct.srv.gc.ca/cgi-bin/direct500/XFo%3dGC%2cc%3dCA");

if (false === $FraOrganizationListPage) {
    // file_get_contents() returns false on failure; report it and leave any
    // previously generated output file untouched instead of truncating it.
    trigger_error("Could not download the French organization list", E_USER_WARNING);
} else {
    $FraOrganizationArray = FrafindOrgsInURLs($FraOrganizationListPage);
    $FraOrganizationNameArray = FrafindOrgNamesInURLs($FraOrganizationListPage);

    foreach ($FraOrganizationArray as $Fraindex => $Fraacro) {
        // Keep the text that follows the first hyphen, up to the next hyphen
        // (presumably the French half of an "ENG-FRA" pair). A value without a
        // hyphen yields an empty acronym.
        $Framatch = preg_match("/-([^\-]*)/", $Fraacro, $Framatches) ? $Framatches[1] : "";

        // NOTE(review): acronyms and names come from two separate patterns, so the
        // two lists only line up if every matching link is matched by both.
        $Franame = isset($FraOrganizationNameArray[$Fraindex]) ? $FraOrganizationNameArray[$Fraindex] : "";

        $FraToWrite .= $Framatch . "    " . $Franame . "\r\n";
    }

    $Frafp = fopen($Frafile, 'w');
    if (false === $Frafp) {
        // fopen() already raised a warning; do not pass a non-resource to fwrite().
        trigger_error("Could not open " . $Frafile . " for writing", E_USER_WARNING);
    } else {
        fwrite($Frafp, $FraToWrite);
        fclose($Frafp);
    }
}



// The script stops here: none of the statements between this exit and the
// function declarations further down are executed. The functions themselves are
// still callable from the code above, which is how the acronym export uses them.
exit(0);



// from here we are getting contact information

// Collect, for every organization identifier found above, the links listed on
// that organization's search-result page.
$URLArray = array();

foreach($OrganizationArray as $OrgAcro) {

    // $OrgAcro is interpolated into the query URL as-is.
    $PageWithPeople = file_get_contents("http://direct.srv.gc.ca/cgi-bin/direct500/SEou%3d$OrgAcro%2co%3dGC%2cc%3dCA?SV=web&SF=Title&ST=contains&x=31&y=20");

    $URLArray = array_merge($URLArray, findURLs($PageWithPeople));

}



// Maps a pattern to the replacement template that picks the wanted capture group.
// FindStuff() walks this array in order, so the entries below are listed in the
// same order as the column headings written to $record further down.
// NOTE(review): the "�" inside several character classes looks like a
// mis-encoded (probably accented) character - confirm against the original file.
$rxpArray = array(

    // Name

    '/(<h2>)([\s]{0,10})([^\r\n]*)([\s]{0,10})(<a)/is' => '$3',

    // Title

    '/(<!-- Display detailed information -->)([\s]{0,10})(<div class="*text"*>)([\s]{0,10})([^<]*)(<br>)(<br>)([\s]{0,15})(<!-- title of person -->)/is' => '$5',

    // Organization 1

    '/(<!-- title of person -->)([\s]{0,10})([\w ,\-\(\)�]*)([\s]{0,10})(<br>)([\s]{0,10})(<!-- top level OU -->)/is' => '${3}',

    // Organization 2

    '/(<!-- top level OU -->)([\s]{0,10})([\w ,\-\(\)�]*)([\s]{0,10})*(<br>)([\s]{0,10})(<!-- immediate OU -->)/is' => '$3',

    // Address

    '/(<!-- Address - PO Box - Mail stop - City - Province - Contry - Postal code -->)([\s]{0,10})(<br>)([^<]*)(<br>)/is' => '$4',

    // City, Province

    '/(ITEM=\[\]\$-->)([\s]{0,10})(<br>)([\w ,\-\(\)�]*)([\s]{0,10})(<br>)/is' => '$4',

    // Country

    '/(<br>)([\w]*)(<br>)([\s]{0,10})([\w \-\(\)�]*)([\s]{0,10})(<!-- Telephone - Alternate telephone - Secure telephone - Fax - Secure Fax - TDD -->)/is' => '$2',

    // Postal Code

    '/(<br>)([\s]{0,10})([\w ]*)([\s]{0,10})(<!-- Telephone - Alternate telephone - Secure telephone - Fax - Secure Fax - TDD -->)/is' => '$3',

    // Telephone 1

    '/(<!-- Telephone - Alternate telephone - Secure telephone - Fax - Secure Fax - TDD -->)([\s]{0,10})(<dl>)([\s]{0,10})(<dt>Telephone:<\/dt><dd>)([^<]*)(<\/dd>)/is' => '$6',

    // Telephone 2

    '/(<dd>)([\w\(\)\- ]*)(<\/dd>)([\s]{0,10})(<dt>Fax:<\/dt><dd>)([^<]*)(<\/dd>)([\s]{0,10})(<\/dl>)([\s]{0,10})(<!-- X400 address -->)/is' => '$2',        

    // Fax

    '/(<dt>Fax:<\/dt><dd>)([^<]*)(<\/dd>)([\s]{0,10})(<\/dl>)([\s]{0,10})(<!-- X400 address -->)/is' => '$2',

    );

    

// these spaces are tabs (since we want a tab-separated spreadsheet out of this).

// Header row: one heading per $rxpArray entry, terminated by CRLF.
$record = ("Name" . "    ");

$record .= ("Title" . "    ");

$record .= ("Organization 1" . "    ");

$record .= ("Organization 2" . "    ");

$record .= ("Address" . "    ");

$record .= ("City, Province" . "    ");

$record .= ("Country" . "    ");

$record .= ("Postal Code" . "    ");

$record .= ("Telephone 1" . "    ");

$record .= ("Telephone 2" . "    ");

$record .= ("Fax" . "\r\n");



// Fetch every collected link and append the fields FindStuff() extracts from it.
foreach ($URLArray as $file) {

    $fileContents = file_get_contents($file);

    $WhatToAdd = FindStuff($fileContents, $rxpArray);

    // Skip the row when it equals this separators-only literal, which is
    // presumably what FindStuff() returns when no pattern matched at all.
    if ("                                            \r\n" != $WhatToAdd) {

        $record .= $WhatToAdd;

    }

}



// Write the header plus all collected rows in one go.
WriteFile("GEDS-record.txt", $record);



/**
 * Write a string to a file, replacing whatever the file contained before.
 *
 * The target must be writable by the PHP process.
 *
 * @param string $strTargetx Path of the file to write.
 * @param string $tpx        Content to store in the file.
 *
 * @return bool True when the whole string was written; false when the file
 *              could not be opened or the write failed or was incomplete.
 */
function WriteFile($strTargetx, $tpx) {
    $handle = fopen($strTargetx, 'w');

    if (false === $handle) {
        // fopen() has already raised a warning; never hand a non-resource to fwrite().
        return false;
    }

    $written = fwrite($handle, $tpx);
    fclose($handle);

    // fwrite() returns the number of bytes written, or false on error.
    return $written === strlen((string) $tpx);
}



/**
 * Build one output row from a page by applying every pattern in turn.
 *
 * For each pattern the first match in the page is located, then the replacement
 * template is applied to the matched text to keep only the wanted capture group.
 * A pattern that does not match contributes an empty field.
 *
 * @param string $strToFindOn Text (page source) to search.
 * @param array  $rxpArray    Map of regular expression => replacement template,
 *                            processed in array order.
 *
 * @return string The extracted fields, each followed by the field separator,
 *                with "\r\n" appended at the end.
 */
function FindStuff($strToFindOn, $rxpArray) {
    $newRecordToAppend = "";

    foreach ($rxpArray as $rxp => $replacement) {
        $field = "";

        // Only touch $matches[0] when the pattern really matched; without the
        // guard a miss reads an undefined key and feeds null to preg_replace().
        if (1 === preg_match($rxp, $strToFindOn, $matches)) {
            // Same operation the file's ReplaceStuff() helper performs.
            $field = preg_replace($rxp, $replacement, $matches[0]);
        }

        // this space is a tab (since we want a tab-separated spreadsheet out of this).
        $newRecordToAppend .= ($field . "    ");
    }

    $newRecordToAppend .= "\r\n";

    return $newRecordToAppend;
}



/**
 * Collect the href value of every link that immediately follows an <li> tag.
 *
 * @param string $strToFindOn HTML to scan (matching is case-insensitive).
 *
 * @return array The captured href values, in the order they were matched.
 */
function FindURLs($strToFindOn) {
    $hrefPattern = "/<li><a\shref=\"([^\"]*)\"/is";
    $found = array();

    preg_match_all($hrefPattern, $strToFindOn, $found, PREG_PATTERN_ORDER);

    // Index 1 holds the list of values captured by the first group.
    $hrefs = $found[1];

    return $hrefs;
}



/**
 * Extract the organization identifiers embedded in the "XEou%3d..." directory
 * links of a listing page.
 *
 * @param string $strToFindOn HTML of the listing page.
 *
 * @return array The text found between "XEou%3d" and "%2co%3dGC%2cc%3dCA" in
 *               each matching link, in the order the links were matched.
 */
function FindOrgsInURLs($strToFindOn) {
    $linkPattern = "/<li><a\shref=\"http:\/\/direct\.srv\.gc\.ca\/cgi\-bin\/direct500\/XEou%3d([^\"]*)%2co%3dGC%2cc%3dCA\"/is";
    $found = array();

    preg_match_all($linkPattern, $strToFindOn, $found, PREG_PATTERN_ORDER);

    // Index 1 holds the identifiers captured by the first group.
    $identifiers = $found[1];

    return $identifiers;
}



/**
 * Extract the organization identifiers embedded in the "XFou%3d..." directory
 * links of a listing page (the variant used for the French listing).
 *
 * @param string $strToFindOn HTML of the listing page.
 *
 * @return array The text found between "XFou%3d" and "%2co%3dGC%2cc%3dCA" in
 *               each matching link, in the order the links were matched.
 */
function FraFindOrgsInURLs($strToFindOn) {
    $linkPattern = "/<li><a\shref=\"http:\/\/direct\.srv\.gc\.ca\/cgi\-bin\/direct500\/XFou%3d([^\"]*)%2co%3dGC%2cc%3dCA\"/is";
    $found = array();

    preg_match_all($linkPattern, $strToFindOn, $found, PREG_PATTERN_ORDER);

    // Index 1 holds the identifiers captured by the first group.
    $identifiers = $found[1];

    return $identifiers;
}



/**
 * Extract the link text of every "XEou%3d..." directory link on a listing page.
 *
 * Only links whose href attribute is directly followed by ">" are matched.
 *
 * @param string $strToFindOn HTML of the listing page.
 *
 * @return array The text following each matching link's opening tag, up to the
 *               next "<", in the order the links were matched.
 */
function FindOrgNamesInURLs($strToFindOn) {
    $linkPattern = "/<li><a\shref=\"http:\/\/direct\.srv\.gc\.ca\/cgi\-bin\/direct500\/XEou%3d([^\"]*)%2co%3dGC%2cc%3dCA\">([^<]*)/is";
    $found = array();

    preg_match_all($linkPattern, $strToFindOn, $found, PREG_PATTERN_ORDER);

    // Index 2 holds the link texts captured by the second group.
    $names = $found[2];

    return $names;
}



/**
 * Extract the link text of every "XFou%3d..." directory link on a listing page
 * (the variant used for the French listing).
 *
 * Only links whose href attribute is directly followed by ">" are matched.
 *
 * @param string $strToFindOn HTML of the listing page.
 *
 * @return array The text following each matching link's opening tag, up to the
 *               next "<", in the order the links were matched.
 */
function FraFindOrgNamesInURLs($strToFindOn) {
    $linkPattern = "/<li><a\shref=\"http:\/\/direct\.srv\.gc\.ca\/cgi\-bin\/direct500\/XFou%3d([^\"]*)%2co%3dGC%2cc%3dCA\">([^<]*)/is";
    $found = array();

    preg_match_all($linkPattern, $strToFindOn, $found, PREG_PATTERN_ORDER);

    // Index 2 holds the link texts captured by the second group.
    $names = $found[2];

    return $names;
}



/**
 * Apply a regular-expression replacement to a string.
 *
 * @param string $strToReplaceOn Text to run the replacement on.
 * @param string $rxp            Regular expression (with delimiters).
 * @param string $replacement    Replacement template; may reference capture groups.
 *
 * @return mixed Whatever preg_replace() returns for these arguments.
 */
function ReplaceStuff($strToReplaceOn, $rxp, $replacement) {
    return preg_replace($rxp, $replacement, $strToReplaceOn);
}



?>
About us
Advertise on this site
For more information send a message to info at phpclasses dot org.