<?php /** * UTF8 - very fast UTF-8 converter implementation for PHP * * It converts between UTF-8 string and codepoints very fast * Suitable for Standard Compression Scheme for Unicode <http://www.unicode.org/reports/tr6/> * Based on PHP source of UTF8 functions by Henri Sivonen <hsivonen@iki.fi> <http://iki.fi/hsivonen/php-utf8/> * * @author Alexey A.Znaev <znaeff@mail.ru> <http://xbsoft.org> * @copyright Copyright (C) 2011-2012 Alexey A.Znaev * @license http://www.gnu.org/licenses GNU Public License version 3 * @link http://xbsoft.org * @package UTF8 * @version 1.0 */
// -----------------------------------------------------------------------------
/**
 * Provides methods for very fast UTF-8 conversion
 *
 * See UTF8.php File description for full information
 *
 * @author Alexey A.Znaev <znaeff@mail.ru> <http://xbsoft.org>
 * @link http://xbsoft.org
 * @package UTF8
 * @version 1.0
 * @since 1.0
 */
class UTF8
{
    /**
     * Converts UTF-8 string to array of Unicode codepoints
     *
     * Decoded codepoints are appended to $out. U+FEFF (byte order mark) is
     * dropped: it is neither stored nor counted against $maxCP. Decoding stops
     * when $maxCP codepoints have been appended or the end of the string is
     * reached. A multi-octet sequence that is cut off by the end of the string
     * is consumed without producing a codepoint and without an exception.
     *
     * @param &string $str Reference to UTF-8 string
     * @param &mixed[] $out Reference to array to store the result
     * @param int $start Starting offset in string
     * @param int $maxCP Maximal number of codepoints to append, PHP_INT_MAX when omitted
     * @return int Returns offset of the byte next to the last one consumed
     * @throws UTF8_Exception INPUT_OCT_RANGE for an illegal lead octet,
     *                        INPUT_OCT_INCOMPL for a missing continuation octet,
     *                        INPUT_OCT_BAD for overlong forms, surrogates,
     *                        5/6-octet sequences and values above U+10FFFF
     */
    public function strToCodepoints(&$str, &$out, $start, $maxCP = PHP_INT_MAX)
    {
        $mState = 0; // number of continuation octets still expected
        $mUcs4 = 0;  // codepoint being assembled
        $mBytes = 1; // total length of the sequence being decoded
        $count = 0;  // codepoints appended to $out by this call
        $len = strlen($str);

        for ($i = $start; ($count < $maxCP) && ($i < $len); $i++) {
            // Square brackets: the curly-brace offset syntax was removed in PHP 8.0.
            $in = ord($str[$i]);

            if (0 == $mState) {
                // Expecting the first octet of a sequence.
                if (0 == (0x80 & $in)) {
                    // US-ASCII, stored as is.
                    $out[] = $in;
                    $mBytes = 1;
                    $count++;
                } else if (0xC0 == (0xE0 & $in)) {
                    $mUcs4 = ($in & 0x1F) << 6;
                    $mState = 1;
                    $mBytes = 2;
                } else if (0xE0 == (0xF0 & $in)) {
                    $mUcs4 = ($in & 0x0F) << 12;
                    $mState = 2;
                    $mBytes = 3;
                } else if (0xF0 == (0xF8 & $in)) {
                    $mUcs4 = ($in & 0x07) << 18;
                    $mState = 3;
                    $mBytes = 4;
                } else if (0xF8 == (0xFC & $in)) {
                    // 5-octet lead: decoded here, rejected once the sequence completes.
                    $mUcs4 = ($in & 0x03) << 24;
                    $mState = 4;
                    $mBytes = 5;
                } else if (0xFC == (0xFE & $in)) {
                    // 6-octet lead: decoded here, rejected once the sequence completes.
                    $mUcs4 = ($in & 1) << 30;
                    $mState = 5;
                    $mBytes = 6;
                } else {
                    throw new UTF8_Exception('Octet #' . $i . '.', UTF8_Exception::INPUT_OCT_RANGE);
                }
            } else {
                // Inside a multi-octet sequence: only 10xxxxxx octets are legal.
                if (0x80 != (0xC0 & $in)) {
                    throw new UTF8_Exception('Octet #' . $i . '.', UTF8_Exception::INPUT_OCT_INCOMPL);
                }

                $shift = ($mState - 1) * 6;
                $mUcs4 |= ($in & 0x0000003F) << $shift;

                if (0 == --$mState) {
                    // Sequence complete: reject overlong forms, 5/6-octet
                    // sequences, surrogates and values beyond U+10FFFF.
                    if (((2 == $mBytes) && ($mUcs4 < 0x0080))
                        || ((3 == $mBytes) && ($mUcs4 < 0x0800))
                        || ((4 == $mBytes) && ($mUcs4 < 0x10000))
                        || (4 < $mBytes)
                        || (($mUcs4 & 0xFFFFF800) == 0xD800)
                        || ($mUcs4 > 0x10FFFF)
                    ) {
                        throw new UTF8_Exception('Octet #' . $i . '.', UTF8_Exception::INPUT_OCT_BAD);
                    }

                    // The byte order mark is skipped and not counted.
                    if (0xFEFF != $mUcs4) {
                        $out[] = $mUcs4;
                        $count++;
                    }

                    $mState = 0;
                    $mUcs4 = 0;
                    $mBytes = 1;
                }
            }
        }

        return $i;
    }

    /**
     * Converts single Unicode codepoint to UTF-8 string
     *
     * U+FEFF (byte order mark) is converted to an empty string.
     *
     * @param int $cp The Unicode codepoint value
     * @return string Returns UTF-8 string containing bytes representing codepoint
     * @throws UTF8_Exception OUTPUT_CP_NEG for a negative value,
     *                        OUTPUT_CP_SUR for a surrogate (U+D800..U+DFFF),
     *                        OUTPUT_CP_RANGE for a value above U+10FFFF
     */
    public function codepointToStr($cp)
    {
        $res = '';

        if ($cp < 0) {
            // The message reports the offending value itself.
            throw new UTF8_Exception('Codepoint #' . $cp . '.', UTF8_Exception::OUTPUT_CP_NEG);
        } else if ($cp <= 0x007f) {
            $res .= chr($cp);
        } else if ($cp <= 0x07ff) {
            $res .= chr(0xc0 | ($cp >> 6));
            $res .= chr(0x80 | ($cp & 0x003f));
        } else if ($cp == 0xFEFF) {
            // Byte order mark: intentionally produces no output.
        } else if ($cp >= 0xD800 && $cp <= 0xDFFF) {
            throw new UTF8_Exception('Codepoint #' . $cp . '.', UTF8_Exception::OUTPUT_CP_SUR);
        } else if ($cp <= 0xffff) {
            $res .= chr(0xe0 | ($cp >> 12));
            $res .= chr(0x80 | (($cp >> 6) & 0x003f));
            $res .= chr(0x80 | ($cp & 0x003f));
        } else if ($cp <= 0x10ffff) {
            $res .= chr(0xf0 | ($cp >> 18));
            $res .= chr(0x80 | (($cp >> 12) & 0x3f));
            $res .= chr(0x80 | (($cp >> 6) & 0x3f));
            $res .= chr(0x80 | ($cp & 0x3f));
        } else {
            throw new UTF8_Exception('Codepoint #' . $cp . '.', UTF8_Exception::OUTPUT_CP_RANGE);
        }

        return $res;
    }
}
/**
 * Exception type raised for UTF-8 conversion errors
 *
 * Error codes are grouped into classes by their high nibble (0x10 input,
 * 0x20 output). The constructor prepends the class description and, for a
 * specific code, its own description to the caller-supplied message.
 *
 * See UTF8.php File description for full information
 *
 * @author Alexey A.Znaev <znaeff@mail.ru> <http://xbsoft.org>
 * @link http://xbsoft.org
 * @package UTF8
 * @version 1.0
 * @since 1.0
 */
class UTF8_Exception extends Exception
{
    const INTERNAL          = 0x00;

    const INPUT             = 0x10;
    const INPUT_OCT_RANGE   = 0x11;
    const INPUT_OCT_BAD     = 0x12;
    const INPUT_OCT_INCOMPL = 0x13;

    const OUTPUT            = 0x20;
    const OUTPUT_CP_NEG     = 0x21;
    const OUTPUT_CP_SUR     = 0x22;
    const OUTPUT_CP_RANGE   = 0x23;

    /** Description text for every error class and every specific error code */
    private static $Messages = array(
        self::INTERNAL          => 'Internal error.',
        self::INPUT             => 'Illegal input.',
        self::INPUT_OCT_RANGE   => 'Octet is neither in the US-ASCII range nor a legal first octet of a multi-octet sequence.',
        self::INPUT_OCT_BAD     => 'Illegal non-shortest form or surrogate character or codepoint outside the Unicode range.',
        self::INPUT_OCT_INCOMPL => 'Incomplete multi-octet sequence.',
        self::OUTPUT            => 'Bad output.',
        self::OUTPUT_CP_NEG     => 'Negative value.',
        self::OUTPUT_CP_SUR     => 'Surrogate value.',
        self::OUTPUT_CP_RANGE   => 'Out of range.'
    );

    /**
     * Builds the exception with a message of the form
     * "UTF8: <class description> <code description> <message>"
     *
     * Descriptions that are not known for the given code are left out.
     *
     * @param string $message Detail text appended after the descriptions
     * @param int $code One of the class constants
     * @param Exception $previous Previous exception, if any
     */
    public function __construct($message = '', $code = 0x00, Exception $previous = null)
    {
        // The high nibble of the code selects the error class.
        $group = $code & 0xF0;

        $lead = array_key_exists($group, self::$Messages)
            ? self::$Messages[$group]
            : '';

        // Add the specific description unless the code is the class code itself.
        $isSpecific = ($code != $group);
        if ($isSpecific && array_key_exists($code, self::$Messages)) {
            $lead = $lead . ' ' . self::$Messages[$code];
        }

        $text = empty($lead) ? $message : $lead . ' ' . $message;

        parent::__construct('UTF8: ' . $text, $code, $previous);
    }
}
?>
|