<?php
/**
* UTF8 - very fast UTF-8 converter implementation for PHP
*
* It converts between UTF-8 string and codepoints very fast
* Suitable for Standard Compression Scheme for Unicode <http://www.unicode.org/reports/tr6/>
* Based on PHP source of UTF8 functions by Henri Sivonen <hsivonen@iki.fi> <http://iki.fi/hsivonen/php-utf8/>
*
* @author Alexey A.Znaev <znaeff@mail.ru> <http://xbsoft.org>
* @copyright Copyright (C) 2011-2012 Alexey A.Znaev
* @license http://www.gnu.org/licenses GNU Public License version 3
* @link http://xbsoft.org
* @package UTF8
* @version 1.0
*/
// -----------------------------------------------------------------------------
/**
* Provides methods for very fast UTF-8 conversion
*
* See UTF8.php File description for full information
*
* @author Alexey A.Znaev <znaeff@mail.ru> <http://xbsoft.org>
* @link http://xbsoft.org
* @package UTF8
* @version 1.0
* @since 1.0
*/
class UTF8 {
    /**
     * Converts a UTF-8 string to Unicode codepoints, appending them to $out.
     *
     * Decoding starts at byte offset $start and stops when either the end of
     * the string is reached or $maxCP codepoints have been appended by this
     * call. U+FEFF (byte order mark / zero width no-break space) is consumed
     * but neither stored nor counted towards $maxCP.
     *
     * @param string $str   UTF-8 string (passed by reference; not modified by this method)
     * @param int[]  $out   Array the decoded codepoints are appended to (passed by reference)
     * @param int    $start Starting byte offset in the string
     * @param int    $maxCP Maximum number of codepoints to append in this call, PHP_INT_MAX when omitted
     * @return int Offset of the first byte that was not consumed (pass it as $start to resume)
     * @throws UTF8_Exception INPUT_OCT_RANGE   when an octet is not a valid lead octet,
     *                        INPUT_OCT_BAD     on overlong forms, 5/6-octet sequences, surrogates
     *                                          or values above U+10FFFF,
     *                        INPUT_OCT_INCOMPL when a continuation octet is missing, including a
     *                                          sequence cut off by the end of the string
     */
    public function strToCodepoints(&$str, &$out, $start, $maxCP = PHP_INT_MAX){
        $mState = 0; // number of continuation octets still expected
        $mUcs4 = 0;  // codepoint being assembled
        $mBytes = 1; // total length of the sequence being decoded
        $count = 0;  // codepoints appended by this call
        $len = strlen($str);
        for ($i = $start; ($count < $maxCP) && ($i < $len); $i++) {
            // Square brackets: the curly-brace offset syntax was removed in PHP 8.0.
            $in = ord($str[$i]);
            if (0 === $mState) {
                // Expecting a lead octet (or a plain US-ASCII octet).
                if (0 === (0x80 & $in)) {
                    $out[] = $in;
                    $mBytes = 1;
                    $count++;
                } else if (0xC0 === (0xE0 & $in)) {
                    $mUcs4 = ($in & 0x1F) << 6;
                    $mState = 1;
                    $mBytes = 2;
                } else if (0xE0 === (0xF0 & $in)) {
                    $mUcs4 = ($in & 0x0F) << 12;
                    $mState = 2;
                    $mBytes = 3;
                } else if (0xF0 === (0xF8 & $in)) {
                    $mUcs4 = ($in & 0x07) << 18;
                    $mState = 3;
                    $mBytes = 4;
                } else if (0xF8 === (0xFC & $in)) {
                    // 5-octet lead: decoded so that it is rejected below as INPUT_OCT_BAD.
                    $mUcs4 = ($in & 0x03) << 24;
                    $mState = 4;
                    $mBytes = 5;
                } else if (0xFC === (0xFE & $in)) {
                    // 6-octet lead: decoded so that it is rejected below as INPUT_OCT_BAD.
                    $mUcs4 = ($in & 1) << 30;
                    $mState = 5;
                    $mBytes = 6;
                } else {
                    throw new UTF8_Exception('Octet #' . $i . '.', UTF8_Exception::INPUT_OCT_RANGE);
                }
            } else {
                // Inside a multi-octet sequence: only 10xxxxxx octets are legal here.
                if (0x80 !== (0xC0 & $in)) {
                    throw new UTF8_Exception('Octet #' . $i . '.', UTF8_Exception::INPUT_OCT_INCOMPL);
                }
                $shift = ($mState - 1) * 6;
                $mUcs4 |= ($in & 0x3F) << $shift;
                if (0 === --$mState) {
                    // Sequence complete: reject non-shortest forms, sequences longer
                    // than 4 octets, surrogates and values beyond the Unicode range.
                    if (((2 === $mBytes) && ($mUcs4 < 0x0080)) ||
                        ((3 === $mBytes) && ($mUcs4 < 0x0800)) ||
                        ((4 === $mBytes) && ($mUcs4 < 0x10000)) ||
                        (4 < $mBytes) ||
                        (($mUcs4 & 0xFFFFF800) === 0xD800) ||
                        ($mUcs4 > 0x10FFFF)) {
                        throw new UTF8_Exception('Octet #' . $i . '.', UTF8_Exception::INPUT_OCT_BAD);
                    }
                    // U+FEFF is dropped from the output wherever it occurs.
                    if (0xFEFF !== $mUcs4) {
                        $out[] = $mUcs4;
                        $count++;
                    }
                    $mState = 0;
                    $mUcs4 = 0;
                    $mBytes = 1;
                }
            }
        }
        // $count only grows when a sequence completes, so a non-zero state here
        // means the string ended in the middle of a multi-octet sequence.
        if (0 !== $mState) {
            throw new UTF8_Exception('Octet #' . $i . '.', UTF8_Exception::INPUT_OCT_INCOMPL);
        }
        return $i;
    }

    /**
     * Converts a single Unicode codepoint to its UTF-8 byte string.
     *
     * U+FEFF is converted to an empty string, mirroring strToCodepoints(),
     * which drops it while decoding.
     *
     * @param int $cp The Unicode codepoint value
     * @return string UTF-8 bytes representing the codepoint ('' for U+FEFF)
     * @throws UTF8_Exception OUTPUT_CP_NEG for negative values, OUTPUT_CP_SUR for
     *                        surrogates (U+D800..U+DFFF), OUTPUT_CP_RANGE above U+10FFFF
     */
    public function codepointToStr($cp){
        // Exception messages report the offending value itself; the original code
        // interpolated an undefined variable here.
        if ($cp < 0) {
            throw new UTF8_Exception('Codepoint #' . $cp . '.', UTF8_Exception::OUTPUT_CP_NEG);
        }
        if ($cp <= 0x007F) {
            return chr($cp);
        }
        if ($cp <= 0x07FF) {
            return chr(0xC0 | ($cp >> 6))
                . chr(0x80 | ($cp & 0x3F));
        }
        if ($cp == 0xFEFF) {
            return '';
        }
        if ($cp >= 0xD800 && $cp <= 0xDFFF) {
            throw new UTF8_Exception('Codepoint #' . $cp . '.', UTF8_Exception::OUTPUT_CP_SUR);
        }
        if ($cp <= 0xFFFF) {
            return chr(0xE0 | ($cp >> 12))
                . chr(0x80 | (($cp >> 6) & 0x3F))
                . chr(0x80 | ($cp & 0x3F));
        }
        if ($cp <= 0x10FFFF) {
            return chr(0xF0 | ($cp >> 18))
                . chr(0x80 | (($cp >> 12) & 0x3F))
                . chr(0x80 | (($cp >> 6) & 0x3F))
                . chr(0x80 | ($cp & 0x3F));
        }
        throw new UTF8_Exception('Codepoint #' . $cp . '.', UTF8_Exception::OUTPUT_CP_RANGE);
    }
}
/**
* Provides exceptions for UTF-8 conversion errors
*
* See UTF8.php File description for full information
*
* @author Alexey A.Znaev <znaeff@mail.ru> <http://xbsoft.org>
* @link http://xbsoft.org
* @package UTF8
* @version 1.0
* @since 1.0
*/
class UTF8_Exception extends Exception {
    // Error codes. The high nibble selects the category (INTERNAL / INPUT /
    // OUTPUT); a non-zero low nibble selects a specific error in that category.
    const INTERNAL = 0x00;
    const INPUT = 0x10;
    const INPUT_OCT_RANGE = 0x11;
    const INPUT_OCT_BAD = 0x12;
    const INPUT_OCT_INCOMPL = 0x13;
    const OUTPUT = 0x20;
    const OUTPUT_CP_NEG = 0x21;
    const OUTPUT_CP_SUR = 0x22;
    const OUTPUT_CP_RANGE = 0x23;

    /**
     * Human-readable text for every category and specific error code.
     */
    private static $descriptions = array(
        self::INTERNAL          => 'Internal error.',
        self::INPUT             => 'Illegal input.',
        self::INPUT_OCT_RANGE   => 'Octet is neither in the US-ASCII range nor a legal first octet of a multi-octet sequence.',
        self::INPUT_OCT_BAD     => 'Illegal non-shortest form or surrogate character or codepoint outside the Unicode range.',
        self::INPUT_OCT_INCOMPL => 'Incomplete multi-octet sequence.',
        self::OUTPUT            => 'Bad output.',
        self::OUTPUT_CP_NEG     => 'Negative value.',
        self::OUTPUT_CP_SUR     => 'Surrogate value.',
        self::OUTPUT_CP_RANGE   => 'Out of range.'
    );

    /**
     * Builds the exception message as
     * "UTF8: <category text> <specific text> <message>".
     *
     * The category text is looked up by the code's high nibble; the specific
     * text is added only when the code differs from its category and is
     * known. Parts that are not found are left out.
     *
     * @param string $message Caller-supplied detail appended after the standard texts
     * @param int $code One of the class constants
     * @param Exception $previous Previous exception, passed through to Exception
     */
    public function __construct($message = '', $code = 0x00, Exception $previous = null) {
        $category = $code & 0xF0;

        $lead = array_key_exists($category, self::$descriptions)
            ? self::$descriptions[$category]
            : '';

        $isSpecific = ($code != $category) && array_key_exists($code, self::$descriptions);
        if ($isSpecific) {
            $lead = $lead . ' ' . self::$descriptions[$code];
        }

        $text = empty($lead) ? $message : $lead . ' ' . $message;

        parent::__construct('UTF8: ' . $text, $code, $previous);
    }
}
?>
|