<?php // classes/demo_UTF8.php
/**
* This script uses class_UTF8 to determine if a string is UTF-8 compatible.
*
* The constructor receives a string and returns an object containing the
* string and a validity indicator. If the string fails UTF-8 validation,
* the offset location of the failures will be provided in an array in the
* "error" property.
*
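* The class can also attempt to repair damaged encodings, but the outcome
* of repairs is less certain. PHP converts extended ASCII into UTF-8 by
* putting a lead byte (hex C2 or C3 for ISO-8859-1 input) in front of the
* extended ASCII characters, thus a repair can only be as good as the guess
* about which single-byte encoding the damaged bytes originally came from.
*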
*/
error_reporting(E_ALL);
require_once('class_UTF8.php');

// Declare the page charset, then open a <pre> so the PHP_EOL line breaks
// written below are rendered as line breaks by the browser
echo '<meta charset="utf-8" />', '<pre>';

// UTF-8 test data - a mix of valid and invalid byte sequences
$arr = [
    'ABCDEF',
    '14°F is cold!',
    'Größe',
    '©',
    "\xC3\x86",     // AE Ligature in UTF-8
    "\xE2\x82\xAC", // Euro in UTF-8

    // Bad UTF-8: each string holds high-bit bytes (127 < byte < 256) that do
    // not form a valid multi-byte sequence
    "\xC6 AE Ligature",
    "Accented \"a\" \xE0 in this string",
    "Several \x80 Euro \x80 symbols \x80 in \x80 text",

    // A UTF-8 nemesis from MSFT Notepad: a leading byte order mark
    "\xEF\xBB\xBFThanks for the BOM, Notepad",

    // A Bogus character that should not be translated
    "Bogus 0x81: \x81",

    // Anthony Ferrara test data
    "\xC0\x80",                 // Overlong encoding of code point 0
    "\xF8\x80\x80\x80\x80",     // Overlong encoding of 5 byte encoding
    "\xFC\x80\x80\x80\x80\x80", // Overlong encoding of 6 byte encoding
    "\xD0\x01",                 // High code-point without trailing characters
    "\x01\x01\x01",             // Actually valid ;-)
];

// First pass: validate only. For every sample show the input bytes, the bytes
// held by the UTF8 object, and the object itself.
echo '<h3>Data Not Repaired</h3>';
foreach ($arr as $str) {
    hexdump($str);
    echo PHP_EOL;

    $obj = new UTF8($str);
    hexdump($obj->str);
    print_r($obj);
    echo PHP_EOL;
}

// Bad UTF-8 test data that we attempt to repair
$bad = [
    "AE Ligature at end: \xC6",
    "Pound at end: \xA3",
    "The \x80 Euro symbol",
    "Several \x80 Euro \x80 symbols \x80 in \x80 text",

    // A Bogus character that cannot be translated
    "Bogus 0x81: \x81",
];

// Second pass: same report, but the second constructor argument asks the
// class to attempt a repair, so the two hexdumps show before and after.
echo '<h3>Data Repair Attempted</h3>';
foreach ($bad as $str) {
    hexdump($str);
    echo PHP_EOL;

    $obj = new UTF8($str, true);
    hexdump($obj->str);
    print_r($obj);
    echo PHP_EOL;
}
/**
 * Unrelated utility function to show us the hex byte values.
 *
 * Echoes five pieces, each preceded by $br: a column ruler cut to the byte
 * length of the input, the raw string itself, the high hex nibble of every
 * byte, the low hex nibble of every byte, and finally nothing (so the output
 * ends with one trailing $br). Read a byte's value by looking down a column
 * of the two nibble rows.
 *
 * The ruler is a fixed 130-character literal, so inputs longer than 130 bytes
 * get a ruler that stops at column 130. The ruler and nibble rows are
 * per byte; the raw string row is only column-aligned with them when every
 * character is a single byte.
 *
 * @param mixed  $str The bytes to display; anything other than null, false,
 *                    '' or [] is cast to string
 * @param string $br  Separator written before each row (default PHP_EOL)
 * @return false|null FALSE when there is nothing to dump, otherwise no value
 */
function hexdump($str, $br=PHP_EOL)
{
    // Nothing to show for null, false, '' or []. This deliberately does not
    // use empty(): empty('0') is TRUE, which would skip a real one-byte string.
    if ($str === null || $str === false || $str === '' || $str === []) {
        return FALSE;
    }
    $str = (string) $str;

    // Split each byte's two hex digits into a hi row and a lo row.
    // bin2hex() always yields exactly two digits per byte.
    $hi = '';
    $lo = '';
    foreach (str_split(bin2hex($str), 2) as $byte) {
        $hi .= $byte[0];
        $lo .= $byte[1];
    }

    // Show the scale, the string and the hex
    $num = substr('1...5...10...15...20...25...30...35...40...45...50...55...60...65...70...75...80...85...90...95..100..105..110..115..120..125..130', 0, strlen($str));
    echo $br . $num;
    echo $br . $str;
    echo $br . $hi;
    echo $br . $lo;
    echo $br;
}
|