<?php
/**
* SCSU - Standard Compression Scheme for Unicode implementation for PHP
*
* It compresses/decompresses UTF-8 strings to/from SCSU-compressed strings.
* SCSU output is a better input for LZF compression than raw UTF-8.
* Based on Java source of SCSU by Unicode, Inc. <http://www.unicode.org/reports/tr6/>
*
* @author Alexey A.Znaev <znaeff@mail.ru> <http://xbsoft.org>
* @copyright Copyright (C) 2011-2012 Alexey A.Znaev
* @license http://www.gnu.org/licenses GNU Public License version 3
* @link http://xbsoft.org
* @package SCSU
* @version 1.0
*/
// -----------------------------------------------------------------------------
/**
* Requires UTF8.php
*/
require_once 'UTF8.php';
/**
* Provides methods for SCSU compression/decompression
*
* See SCSU.php File description for full information
*
* @author Alexey A.Znaev <znaeff@mail.ru> <http://xbsoft.org>
* @link http://xbsoft.org
* @package SCSU
* @version 1.0
* @since 1.0
*/
class SCSU {
    // class constants

    // Single-byte mode tags.
    // SQn: quote a single character from window n.
    const SQ0 = 0x01;
    const SQ1 = 0x02;
    const SQ2 = 0x03;
    const SQ3 = 0x04;
    const SQ4 = 0x05;
    const SQ5 = 0x06;
    const SQ6 = 0x07;
    const SQ7 = 0x08;
    // SDX: define an extended window (two argument bytes).
    const SDX = 0x0B;
    // Srs: reserved tag, rejected by decompress().
    const Srs = 0x0C;
    // SQU: quote a single two-byte Unicode character.
    const SQU = 0x0E;
    // SCU: switch to Unicode mode.
    const SCU = 0x0F;
    // SCn: select dynamic window n.
    const SC0 = 0x10;
    const SC1 = 0x11;
    const SC2 = 0x12;
    const SC3 = 0x13;
    const SC4 = 0x14;
    const SC5 = 0x15;
    const SC6 = 0x16;
    const SC7 = 0x17;
    // SDn: define dynamic window n (one argument byte) and select it.
    const SD0 = 0x18;
    const SD1 = 0x19;
    const SD2 = 0x1A;
    const SD3 = 0x1B;
    const SD4 = 0x1C;
    const SD5 = 0x1D;
    const SD6 = 0x1E;
    const SD7 = 0x1F;

    // Unicode mode tags.
    // UCn: select window n and return to single-byte mode.
    const UC0 = 0xE0;
    const UC1 = 0xE1;
    const UC2 = 0xE2;
    const UC3 = 0xE3;
    const UC4 = 0xE4;
    const UC5 = 0xE5;
    const UC6 = 0xE6;
    const UC7 = 0xE7;
    // UDn: define window n and return to single-byte mode.
    const UD0 = 0xE8;
    const UD1 = 0xE9;
    const UD2 = 0xEA;
    const UD3 = 0xEB;
    const UD4 = 0xEC;
    const UD5 = 0xED;
    const UD6 = 0xEE;
    const UD7 = 0xEF;
    // UQU: quote one Unicode character whose high byte collides with a tag.
    const UQU = 0xF0;
    // UDX: define an extended window and return to single-byte mode.
    const UDX = 0xF1;
    // Urs: not referenced by the code in this class.
    const Urs = 0xF2;

    // Window offset bytes below this value map to (byte << 7).
    const gapThreshold = 0x68;
    // Added to (byte << 7) for offset bytes in [gapThreshold, reservedStart).
    const gapOffset = 0xAC00;
    // Offset bytes in [reservedStart, fixedThreshold) are rejected.
    const reservedStart = 0xA8;
    // Offset bytes from this value up index into $fixedOffset.
    const fixedThreshold = 0xF9;
    // Maximum number of code points requested per compression chunk.
    const cpMaxNum = 10000;

    // private class fields
    private $dynamicOffset;          // start offsets of the 8 dynamic windows
    private $selectedWindow;         // index of the active dynamic window
    private $iIn = 0;                // read position inside $aIn
    private $iInLen = 0;             // number of entries in $aIn
    private $iSCU = -1;              // position of a pending SCU tag in $sOut, -1 if none
    private $fUnicodeMode = false;   // true while the compressor is in Unicode mode
    private $iNextWindow = 3;        // counter used to pick the next window to redefine
    private $sOut = '';              // compressor output buffer
    private $aIn = array();          // code points of the chunk being compressed
    private static $staticOffset = array(
        0x0000, 0x0080, 0x0100, 0x0300, 0x2000, 0x2080, 0x2100, 0x3000
    );
    private static $initialDynamicOffset = array(
        0x0080, 0x00C0, 0x0400, 0x0600, 0x0900, 0x3040, 0x30A0, 0xFF00
    );
    private static $fixedOffset = array(
        0x00C0, 0x0250, 0x0370, 0x0530, 0x3040, 0x30A0, 0xFF60
    );

    // methods
    // public methods
    // compression related public methods
    /**
    * Compresses a UTF-8 string using the SCSU algorithm
    *
    * The input is converted to code points in chunks of at most cpMaxNum
    * entries by UTF8::strToCodepoints() (defined in UTF8.php, not shown here;
    * its return value is used as the next input offset) and each chunk is
    * compressed in turn. Compressor state is kept across chunks.
    *
    * NOTE(review): the compressor treats array entries in 0xD800-0xDFFF as
    * UTF-16 surrogates; confirm that UTF8::strToCodepoints() delivers
    * supplementary characters as surrogate pairs.
    *
    * @param &string $sIn Reference to UTF-8 string
    * @return string SCSU-compressed string
    * @throws SCSU_Exception
    */
    public function compress(&$sIn) {
        $this->reset();
        $this->fUnicodeMode = false;
        $this->iSCU = -1;
        $this->iNextWindow = 3;
        $this->sOut = '';
        $inLen = strlen($sIn);
        $inNext = 0;
        do {
            $this->aIn = array();
            $inNext = UTF8::strToCodepoints($sIn, $this->aIn, $inNext, self::cpMaxNum);
            $this->compress_part();
        } while ($inNext < $inLen);
        // Release the working buffers instead of keeping them on the object.
        $sResult = $this->sOut;
        $this->sOut = '';
        $this->aIn = array();
        return $sResult;
    }

    // decompression related public methods
    /**
    * Decompresses an SCSU-compressed string to a UTF-8 one
    *
    * Every decoded 16-bit value is converted with UTF8::codepointToStr()
    * (defined in UTF8.php, not shown here). Characters from extended windows
    * are emitted as a high/low surrogate pair, each passed separately to
    * UTF8::codepointToStr().
    *
    * @param &string $sIn Reference to SCSU-compressed string
    * @return string UTF-8 string
    * @throws SCSU_Exception On truncated input, a bad window offset or an Srs byte
    */
    public function decompress(&$sIn) {
        $this->reset();
        $iInLen = strlen($sIn);
        $sOut = '';
        for ($iCur = 0; $iCur < $iInLen; $iCur++) {
            $iStaticWindow = 0;
            $iDynamicWindow = $this->selectedWindow;
            $b = ord($sIn[$iCur]);
            switch ($b) {
                case self::SQ0:
                case self::SQ1:
                case self::SQ2:
                case self::SQ3:
                case self::SQ4:
                case self::SQ5:
                case self::SQ6:
                case self::SQ7:
                    // The quoted byte must follow the tag.
                    if ($iCur >= $iInLen - 1) {
                        throw new SCSU_Exception('', SCSU_Exception::INPUT_ENDED);
                    }
                    $iDynamicWindow = $iStaticWindow = $b - self::SQ0;
                    $iCur++;
                    // intentional fall-through: decode the quoted byte
                default:
                    $b = ord($sIn[$iCur]);
                    if ($b < 0x80) {
                        // Low half: static window (window 0 when not quoted).
                        $sOut .= UTF8::codepointToStr($b + self::$staticOffset[$iStaticWindow]);
                    } else {
                        // High half: offset into the dynamic window.
                        $ch = $b - 0x80 + $this->dynamicOffset[$iDynamicWindow];
                        if ($ch < 0x10000) {
                            $sOut .= UTF8::codepointToStr($ch);
                        } else {
                            // Split into a surrogate pair. The low surrogate takes
                            // only the low 10 bits; masking with ~0xFC00 would keep
                            // bits 16-19 and give a wrong value from U+20000 up.
                            $ch -= 0x10000;
                            $sOut .= UTF8::codepointToStr(0xD800 + ($ch >> 10));
                            $sOut .= UTF8::codepointToStr(0xDC00 + ($ch & 0x3FF));
                        }
                    }
                    break;
                case self::SDX:
                    $iCur += 2;
                    if ($iCur >= $iInLen) {
                        throw new SCSU_Exception('', SCSU_Exception::INPUT_ENDED);
                    }
                    $this->defineExtendedWindow(
                        $this->charFromTwoBytes(ord($sIn[$iCur - 1]), ord($sIn[$iCur]))
                    );
                    break;
                case self::SD0:
                case self::SD1:
                case self::SD2:
                case self::SD3:
                case self::SD4:
                case self::SD5:
                case self::SD6:
                case self::SD7:
                    $iCur++;
                    if ($iCur >= $iInLen) {
                        throw new SCSU_Exception('', SCSU_Exception::INPUT_ENDED);
                    }
                    $this->defineWindow($b - self::SD0, ord($sIn[$iCur]));
                    break;
                case self::SC0:
                case self::SC1:
                case self::SC2:
                case self::SC3:
                case self::SC4:
                case self::SC5:
                case self::SC6:
                case self::SC7:
                    $this->selectedWindow = $b - self::SC0;
                    break;
                case self::SCU:
                    // Unicode mode: read two-byte characters until a UCn/UDn/UDX
                    // tag switches back ("break 2" leaves this loop and the switch).
                    $iCur++;
                    for (; $iCur < $iInLen - 1; $iCur += 2) {
                        $b = ord($sIn[$iCur]);
                        if ($b >= self::UC0 && $b <= self::UC7) {
                            $this->selectedWindow = $b - self::UC0;
                            break 2;
                        } else if ($b >= self::UD0 && $b <= self::UD7) {
                            $this->defineWindow($b - self::UD0, ord($sIn[$iCur + 1]));
                            $iCur++;
                            break 2;
                        } else if ($b == self::UDX) {
                            // Needs two argument bytes after the tag.
                            if ($iCur >= $iInLen - 2) {
                                throw new SCSU_Exception('', SCSU_Exception::INPUT_ENDED);
                            }
                            $this->defineExtendedWindow(
                                $this->charFromTwoBytes(ord($sIn[$iCur + 1]), ord($sIn[$iCur + 2]))
                            );
                            $iCur += 2;
                            break 2;
                        } else if ($b == self::UQU) {
                            // Needs a full two-byte character after the tag.
                            if ($iCur >= $iInLen - 2) {
                                throw new SCSU_Exception('', SCSU_Exception::INPUT_ENDED);
                            }
                            $iCur++;
                        }
                        $ch = $this->charFromTwoBytes(ord($sIn[$iCur]), ord($sIn[$iCur + 1]));
                        $sOut .= UTF8::codepointToStr($ch);
                    }
                    // The run has to consume the input exactly.
                    if ($iCur != $iInLen) {
                        throw new SCSU_Exception('', SCSU_Exception::INPUT_ENDED);
                    }
                    break;
                case self::SQU:
                    $iCur += 2;
                    if ($iCur >= $iInLen) {
                        throw new SCSU_Exception('', SCSU_Exception::INPUT_ENDED);
                    }
                    $ch = $this->charFromTwoBytes(ord($sIn[$iCur - 1]), ord($sIn[$iCur]));
                    $sOut .= UTF8::codepointToStr($ch);
                    break;
                case self::Srs:
                    throw new SCSU_Exception('Pos. ' . $iCur . '.', SCSU_Exception::INPUT_SRS);
            }
        }
        return $sOut;
    }

    // private methods
    // common private methods
    /**
    * Tells whether a character can be addressed through a window
    * (anything outside 0x3400-0xDFFF).
    */
    private function isCompressible($ch) {
        return ($ch < 0x3400 || $ch >= 0xE000);
    }

    /**
    * Restores the initial window state: window 0 selected, dynamic
    * windows at their initial offsets.
    */
    private function reset() {
        $this->selectedWindow = 0;
        // Array assignment copies, so the static table is never modified.
        $this->dynamicOffset = self::$initialDynamicOffset;
    }

    /**
    * Combines a high and a low byte into one 16-bit value.
    * Values are masked to 8 bits, so a negative (signed) byte is accepted too.
    */
    private function charFromTwoBytes($hi, $lo) {
        return (($hi & 0xFF) << 8) + ($lo & 0xFF);
    }

    // compression related private methods
    /**
    * Compresses the code points currently held in $this->aIn, appending
    * to $this->sOut. Mode and window state carry over from the previous chunk.
    *
    * @throws SCSU_Exception
    */
    private function compress_part() {
        $this->iInLen = count($this->aIn);
        $this->iIn = 0;
        $ch = 0;
        while ($this->iIn < $this->iInLen) {
            // A previous chunk may have ended in Unicode mode; the run must then
            // be resumed here, otherwise single-byte data would be written into
            // a stream the decoder still reads as Unicode mode.
            if ($this->iSCU != -1 || $this->fUnicodeMode) {
                $ch = $this->outputUnicodeRun();
                if ($this->iSCU != -1 && strlen($this->sOut) - $this->iSCU == 3) {
                    // Only one character followed the SCU tag: turn the tag into
                    // SQU and stay in single-byte mode.
                    $this->sOut[$this->iSCU] = chr(self::SQU);
                    $this->iSCU = -1;
                    continue;
                }
                $this->iSCU = -1;
                $this->fUnicodeMode = true;
            } else {
                $ch = $this->outputSingleByteRun();
            }
            if ($this->iIn == $this->iInLen) break;

            // For an ASCII character look ahead to the next non-ASCII compressible
            // character so the window is chosen for it; fall back to the current
            // character when the lookahead hits the end or an incompressible one.
            for ($ich = $this->iIn; $ch < 0x80; $ich++) {
                if ($ich == $this->iInLen || !$this->isCompressible($this->aIn[$ich])) {
                    $ch = $this->aIn[$this->iIn];
                    break;
                }
                $ch = $this->aIn[$ich];
            }

            $iprevWindow = $this->selectedWindow;
            if ($ch < 0x80 || $this->locateWindow($ch, $this->dynamicOffset)) {
                if (!$this->fUnicodeMode && $this->iIn < $this->iInLen - 1) {
                    // If the following character is still in the previous window,
                    // quote this one instead of switching windows.
                    $ch2 = $this->aIn[$this->iIn + 1];
                    if ($ch2 >= $this->dynamicOffset[$iprevWindow] &&
                        $ch2 < $this->dynamicOffset[$iprevWindow] + 0x80)
                    {
                        $this->quoteSingleByte($ch);
                        $this->selectedWindow = $iprevWindow;
                        continue;
                    }
                }
                $this->sOut .= chr(($this->fUnicodeMode ? self::UC0 : self::SC0) + $this->selectedWindow);
                $this->fUnicodeMode = false;
            } else if (!$this->fUnicodeMode && $this->locateWindow($ch, self::$staticOffset)) {
                $this->quoteSingleByte($ch);
                $this->selectedWindow = $iprevWindow;
                continue;
            } else if ($this->positionWindow($ch)) {
                $this->fUnicodeMode = false;
            } else {
                // No window possible: emit SCU and remember where it is, so it
                // can be rewritten to SQU if only one character follows.
                $this->iSCU = strlen($this->sOut);
                $this->sOut .= chr(self::SCU);
                continue;
            }
        }
    }

    /**
    * Looks for a window in $offsetTable containing $ch, preferring the
    * selected one. On a hit in another window, that window becomes selected.
    *
    * @return bool true when a window was found
    */
    private function locateWindow($ch, $offsetTable) {
        $iWin = $this->selectedWindow;
        if ($iWin != -1 && $ch >= $offsetTable[$iWin] && $ch < $offsetTable[$iWin] + 0x80) {
            return true;
        }
        $offsetTableLen = count($offsetTable);
        for ($iWin = 0; $iWin < $offsetTableLen; $iWin++) {
            if ($ch >= $offsetTable[$iWin] && $ch < $offsetTable[$iWin] + 0x80) {
                $this->selectedWindow = $iWin;
                return true;
            }
        }
        return false;
    }

    /**
    * True for 0x20-0x7F and for TAB, LF and CR.
    */
    private function isAsciiCrLfOrTab($ch) {
        return ($ch >= 0x20 && $ch <= 0x7F) || $ch == 0x09 || $ch == 0x0A || $ch == 0x0D;
    }

    /**
    * Writes characters in single-byte mode for as long as they fit: pass-through
    * ASCII, SQ0-quoted control characters, or characters of the selected window.
    *
    * @return int the first character that could not be written, or 0 at end of input
    * @throws SCSU_Exception on an unpaired surrogate
    */
    private function outputSingleByteRun() {
        $iWin = $this->selectedWindow;
        while ($this->iIn < $this->iInLen) {
            $outLen = 0;
            $byte1 = 0;
            $byte2 = 0;
            $ch = $this->aIn[$this->iIn];
            $inlen = 1;
            if (($ch & 0xF800) == 0xD800) {
                if (($ch & 0xFC00) == 0xDC00) {
                    throw new SCSU_Exception('Byte #' . $this->iIn . '.', SCSU_Exception::INPUT_UNP_LOW);
                }
                if ($this->iIn >= $this->iInLen - 1) {
                    throw new SCSU_Exception('', SCSU_Exception::INPUT_ENDED);
                }
                $ch2 = $this->aIn[$this->iIn + 1];
                if (($ch2 & 0xFC00) != 0xDC00) {
                    throw new SCSU_Exception('Byte #' . ($this->iIn + 1) . '.', SCSU_Exception::INPUT_UNP_HIGH);
                }
                // Combine the surrogate pair into one supplementary code point.
                $ch = (($ch - 0xD800) << 10 | ($ch2 - 0xDC00)) + 0x10000;
                $inlen = 2;
            }
            if ($this->isAsciiCrLfOrTab($ch) || $ch == 0) {
                $byte2 = $ch & 0x7F;
                $outLen = 1;
            } else if ($ch < 0x20) {
                // Other control characters are quoted from static window 0.
                $byte1 = self::SQ0;
                $byte2 = $ch & 255;
                $outLen = 2;
            } else if ($ch >= $this->dynamicOffset[$iWin] && $ch < $this->dynamicOffset[$iWin] + 0x80) {
                $byte2 = ($ch - $this->dynamicOffset[$iWin]) | 0x80;
                $outLen = 1;
            }
            if ($outLen == 0) {
                // Does not fit single-byte mode: let the caller decide.
                return $ch;
            }
            if ($outLen == 2) {
                $this->sOut .= chr($byte1);
            }
            $this->sOut .= chr($byte2);
            $this->iIn += $inlen;
        }
        return 0;
    }

    /**
    * Writes $ch as an SQn quote from the selected window index, using the
    * dynamic window if it contains $ch and the static one otherwise, then
    * advances the input by one entry.
    *
    * @throws SCSU_Exception when neither window contains $ch
    */
    private function quoteSingleByte($ch) {
        $iWin = $this->selectedWindow;
        $this->sOut .= chr((self::SQ0 + $iWin) & 255);
        if ($ch >= $this->dynamicOffset[$iWin] && $ch < $this->dynamicOffset[$iWin] + 0x80) {
            $this->sOut .= chr(($ch - $this->dynamicOffset[$iWin]) | 0x80);
        } else if ($ch >= self::$staticOffset[$iWin] && $ch < self::$staticOffset[$iWin] + 0x80) {
            $this->sOut .= chr(($ch - self::$staticOffset[$iWin]) & 255);
        } else {
            throw new SCSU_Exception('ch = ' . $ch . ' not valid in quoteSingleByte.');
        }
        $this->iIn++;
    }

    /**
    * Writes characters as two bytes each (Unicode mode) until two consecutive
    * compressible characters are met or the input ends.
    *
    * @return int the last character looked at (0 if none)
    */
    private function outputUnicodeRun() {
        $ch = 0;
        while ($this->iIn < $this->iInLen) {
            $ch = $this->aIn[$this->iIn];
            $needsQuote = false;
            if ($this->isCompressible($ch)) {
                if ($this->iIn < $this->iInLen - 1) {
                    $ch2 = $this->aIn[$this->iIn + 1];
                    // Two compressible characters in a row end the run.
                    if ($this->isCompressible($ch2)) break;
                }
                // High bytes 0xE0-0xF2 would be read as Unicode-mode tags.
                if ($ch >= 0xE000 && $ch <= 0xF2FF) $needsQuote = true;
            }
            if ($needsQuote) $this->sOut .= chr(self::UQU);
            $this->sOut .= chr($ch >> 8);
            $this->sOut .= chr($ch & 0xFF);
            $this->iIn++;
        }
        return $ch;
    }

    /**
    * Redefines the next window in rotation so that it contains $ch, writes
    * the matching define-window tag and selects that window.
    *
    * @return bool false when $ch lies in 0x3400-0xDFFF, where no window is defined
    * @throws SCSU_Exception when $ch is below 0x80
    */
    private function positionWindow($ch) {
        $iWin = $this->iNextWindow % 8;
        $iPosition = 0;
        if ($ch < 0x80) throw new SCSU_Exception('ch < 0x80.');
        $fixedCount = count(self::$fixedOffset);
        for ($i = 0; $i < $fixedCount; $i++) {
            if ($ch >= self::$fixedOffset[$i] && $ch < self::$fixedOffset[$i] + 0x80) {
                $iPosition = $i;
                break;
            }
        }
        if ($iPosition != 0) {
            // A match at index 0 is indistinguishable from "no match" and is
            // handled by the generic branch below.
            $this->dynamicOffset[$iWin] = self::$fixedOffset[$iPosition];
            $iPosition += 0xF9;
        } else if ($ch < 0x3400) {
            $iPosition = $ch >> 7;
            $this->dynamicOffset[$iWin] = $ch & 0xFF80;
        } else if ($ch < 0xE000) {
            return false;
        } else if ($ch <= 0xFFFF) {
            $iPosition = (($ch - self::gapOffset) >> 7);
            $this->dynamicOffset[$iWin] = $ch & 0xFF80;
        } else {
            // Extended window: the window index goes into the top three bits.
            $iPosition = ($ch - 0x10000) >> 7;
            $iPosition |= $iWin << 13;
            $this->dynamicOffset[$iWin] = $ch & 0x1FFF80;
        }
        if ($iPosition < 0x100) {
            $this->sOut .= chr(($this->fUnicodeMode ? self::UD0 : self::SD0) + $iWin);
            $this->sOut .= chr($iPosition & 0xFF);
        } else {
            $this->sOut .= chr($this->fUnicodeMode ? self::UDX : self::SDX);
            $this->sOut .= chr(($iPosition >> 8) & 0xFF);
            $this->sOut .= chr($iPosition & 0xFF);
        }
        $this->selectedWindow = $iWin;
        $this->iNextWindow++;
        return true;
    }

    // decompression related private methods
    /**
    * Sets dynamic window $iWindow from a one-byte offset code and selects it.
    *
    * @throws SCSU_Exception for offset 0 or an offset in the reserved range
    */
    private function defineWindow($iWindow, $bOffset) {
        $iOffset = ($bOffset < 0 ? $bOffset + 256 : $bOffset);
        if ($iOffset == 0) {
            throw new SCSU_Exception('', SCSU_Exception::INPUT_OFF_ZERO);
        } else if ($iOffset < self::gapThreshold) {
            $this->dynamicOffset[$iWindow] = $iOffset << 7;
        } else if ($iOffset < self::reservedStart) {
            $this->dynamicOffset[$iWindow] = ($iOffset << 7) + self::gapOffset;
        } else if ($iOffset < self::fixedThreshold) {
            throw new SCSU_Exception('Value = ' . $iOffset . '.', SCSU_Exception::INPUT_OFF_BAD);
        } else {
            $this->dynamicOffset[$iWindow] = self::$fixedOffset[$iOffset - self::fixedThreshold];
        }
        $this->selectedWindow = $iWindow;
    }

    /**
    * Sets and selects an extended window: the top 3 bits of $chOffset give
    * the window index, the low 13 bits the offset above 0x10000 in units of 0x80.
    */
    private function defineExtendedWindow($chOffset) {
        $iWindow = $chOffset >> 13;
        $this->dynamicOffset[$iWindow] = (($chOffset & 0x1FFF) << 7) + (1 << 16);
        $this->selectedWindow = $iWindow;
    }
}
/**
* Provides exceptions of SCSU errors
*
* See SCSU.php File description for full information
*
* @author Alexey A.Znaev <znaeff@mail.ru> <http://xbsoft.org>
* @link http://xbsoft.org
* @package SCSU
* @version 1.0
* @since 1.0
*/
class SCSU_Exception extends Exception {
    // Error codes: the high nibble is the error class, the low nibble the detail.
    const INTERNAL = 0x00;
    const INPUT = 0x10;
    const INPUT_ENDED = 0x11;
    const INPUT_UNP_LOW = 0x12;
    const INPUT_UNP_HIGH = 0x13;
    const INPUT_OFF_ZERO = 0x14;
    const INPUT_OFF_BAD = 0x15;
    const INPUT_SRS = 0x16;
    const OUTPUT = 0x20;

    // Message text for each known class and detail code.
    private static $Messages = array(
        self::INTERNAL => 'Internal error.',
        self::INPUT => 'Illegal input.',
        self::INPUT_ENDED => 'Ended prematurely.',
        self::INPUT_UNP_LOW => 'Unpaired low surrogate.',
        self::INPUT_UNP_HIGH => 'Unpaired high surrogate.',
        self::INPUT_OFF_ZERO => 'Zero offset.',
        self::INPUT_OFF_BAD => 'Bad offset.',
        self::INPUT_SRS => 'Srs byte found.',
        self::OUTPUT => 'Bad output.',
    );

    /**
    * Builds the message as "SCSU: <class text> <detail text> <message>",
    * leaving out the parts that are unknown or empty.
    *
    * @param string $message Additional text appended after the standard texts
    * @param int $code One of the class constants
    * @param Exception|null $previous Previous throwable, passed to the parent
    *        constructor unchanged. Left untyped on purpose: a typed parameter
    *        with a null default is deprecated in newer PHP versions, and the
    *        nullable-type syntax is not available on older ones.
    */
    public function __construct($message = '', $code = 0x00, $previous = null) {
        $parts = array();
        $code_class = $code & 0xF0;
        if (array_key_exists($code_class, self::$Messages)) {
            $parts[] = self::$Messages[$code_class];
        }
        if (($code != $code_class) && array_key_exists($code, self::$Messages)) {
            $parts[] = self::$Messages[$code];
        }
        // Skip an empty message so the result has no trailing space.
        if ((string) $message !== '') {
            $parts[] = $message;
        }
        parent::__construct('SCSU: ' . implode(' ', $parts), $code, $previous);
    }
}
?>
|