Login   Register  
PHP Classes
elePHPant
Icontem

File: SCSU.php

Recommend this page to a friend!
Stumble It! Stumble It! Bookmark in del.icio.us Bookmark in del.icio.us
  Classes of Alexey Znaev  >  SCSU  >  SCSU.php  >  Download  
File: SCSU.php
Role: Class source
Content type: text/plain
Description: SCSU - main class
Class: SCSU
Encode and decode Unicode strings with SCSU
Author: By
Last change:
Date: 2012-04-30 07:19
Size: 18,960 bytes
 

Contents

Class file image Download
<?php
/**
 * SCSU - Standard Compression Scheme for Unicode implementation for PHP
 *
 * Compresses UTF-8 strings to SCSU and decompresses SCSU data back to UTF-8.
 * Useful as a pre-processing step that improves LZF compression of UTF-8 strings.
 * Based on Java source of SCSU by Unicode, Inc. <http://www.unicode.org/reports/tr6/>
 *
 * @author 	Alexey A.Znaev <znaeff@mail.ru> <http://xbsoft.org>
 * @copyright 	Copyright (C) 2011-2012 Alexey A.Znaev
 * @license 	http://www.gnu.org/licenses GNU Public License version 3
 * @link 	http://xbsoft.org
 * @package 	SCSU
 * @version 	1.0
 */
 
// -----------------------------------------------------------------------------

/**
 * Requires UTF8.php
 */
require_once 'UTF8.php';

/**
 * Provides methods for SCSU compression/decompression
 *
 * See SCSU.php File description for full information
 *
 * @author 	Alexey A.Znaev <znaeff@mail.ru> <http://xbsoft.org>
 * @link 	http://xbsoft.org
 * @package 	SCSU
 * @version 	1.0
 * @since 	1.0
 */
class SCSU {

    // ------------------------------------------------------------------
    // Tag bytes, single-byte mode
    // ------------------------------------------------------------------

    // SQn: quote one character from window n (static window when the
    // following byte is < 0x80, dynamic window otherwise).
    const SQ0 = 0x01;
    const SQ1 = 0x02;
    const SQ2 = 0x03;
    const SQ3 = 0x04;
    const SQ4 = 0x05;
    const SQ5 = 0x06;
    const SQ6 = 0x07;
    const SQ7 = 0x08;

    // SDX: define an extended window (two argument bytes follow).
    const SDX = 0x0B;
    // Srs: reserved; decompress() rejects it.
    const Srs = 0x0C;

    // SQU: quote one 16-bit unit (two bytes follow).
    const SQU = 0x0E;
    // SCU: switch to Unicode (two bytes per unit) mode.
    const SCU = 0x0F;

    // SCn: select dynamic window n.
    const SC0 = 0x10;
    const SC1 = 0x11;
    const SC2 = 0x12;
    const SC3 = 0x13;
    const SC4 = 0x14;
    const SC5 = 0x15;
    const SC6 = 0x16;
    const SC7 = 0x17;

    // SDn: define dynamic window n (one offset byte follows) and select it.
    const SD0 = 0x18;
    const SD1 = 0x19;
    const SD2 = 0x1A;
    const SD3 = 0x1B;
    const SD4 = 0x1C;
    const SD5 = 0x1D;
    const SD6 = 0x1E;
    const SD7 = 0x1F;

    // ------------------------------------------------------------------
    // Tag bytes, Unicode mode
    // ------------------------------------------------------------------

    // UCn: leave Unicode mode and select dynamic window n.
    const UC0 = 0xE0;
    const UC1 = 0xE1;
    const UC2 = 0xE2;
    const UC3 = 0xE3;
    const UC4 = 0xE4;
    const UC5 = 0xE5;
    const UC6 = 0xE6;
    const UC7 = 0xE7;

    // UDn: leave Unicode mode, define dynamic window n and select it.
    const UD0 = 0xE8;
    const UD1 = 0xE9;
    const UD2 = 0xEA;
    const UD3 = 0xEB;
    const UD4 = 0xEC;
    const UD5 = 0xED;
    const UD6 = 0xEE;
    const UD7 = 0xEF;

    // UQU: quote one 16-bit unit whose high byte would look like a tag.
    const UQU = 0xF0;
    // UDX: leave Unicode mode and define an extended window.
    const UDX = 0xF1;
    // Urs: declared for completeness; not referenced by this class.
    const Urs = 0xF2;

    // Window-offset byte ranges used by defineWindow().
    const gapThreshold = 0x68;
    const gapOffset = 0xAC00;
    const reservedStart = 0xA8;
    const fixedThreshold = 0xF9;

    // Passed to UTF8::strToCodepoints() as its last argument by compress();
    // presumably the maximum number of code points decoded per chunk.
    const cpMaxNum = 10000;

    // private class fields

    /** @var array Start offsets of the eight dynamic windows. */
    private $dynamicOffset;
    /** @var int Index of the currently selected dynamic window. */
    private $selectedWindow;
    /** @var int Read position inside $aIn. */
    private $iIn = 0;
    /** @var int Number of elements in $aIn. */
    private $iInLen = 0;
    /** @var int Position in $sOut of a tentative SCU tag, or -1 if none. */
    private $iSCU = -1;
    /** @var bool True while the compressor output is in Unicode mode. */
    private $fUnicodeMode = false;
    /** @var int Counter used to pick the next dynamic window to redefine. */
    private $iNextWindow = 3;
    /** @var string Compressed output being built by compress(). */
    private $sOut = '';
    /** @var array Code units of the chunk currently being compressed. */
    private $aIn = array();

    private static $staticOffset = array(
        0x0000, 0x0080, 0x0100, 0x0300, 0x2000, 0x2080, 0x2100, 0x3000
    );

    private static $initialDynamicOffset = array(
        0x0080, 0x00C0, 0x0400, 0x0600, 0x0900, 0x3040, 0x30A0, 0xFF00
    );

    private static $fixedOffset = array(
        0x00C0, 0x0250, 0x0370, 0x0530, 0x3040, 0x30A0, 0xFF60
    );

    // methods

    // public methods

    // compression related public methods

    /**
     * Compresses a UTF-8 string with the SCSU algorithm.
     *
     * The input is decoded through UTF8::strToCodepoints() in chunks of
     * self::cpMaxNum and every chunk is fed to compress_part(); encoder
     * state (windows, mode, pending SCU tag) is carried across chunks.
     *
     * @param 		&string $sIn Reference to UTF-8 string
     * @return 		string SCSU-compressed string
     * @throws 		SCSU_Exception
     */
    public function compress(&$sIn) {
        $this->reset();
        $this->fUnicodeMode = false;
        $this->iSCU = -1;
        $this->iNextWindow = 3;
        $this->sOut = '';

        $inLen = strlen($sIn);
        $inNext = 0;
        do {
            $this->aIn = array();
            $inNext = UTF8::strToCodepoints($sIn, $this->aIn, $inNext, self::cpMaxNum);
            $this->compress_part();
        } while ($inNext < $inLen);

        // Release the decoded chunk; it is only needed while compressing.
        $this->aIn = array();
        return $this->sOut;
    }

    // decompression related public methods

    /**
     * Decompresses an SCSU-compressed string to UTF-8.
     *
     * Every decoded 16-bit unit is converted with UTF8::codepointToStr().
     * Characters above U+FFFF are handed over as a high/low surrogate pair
     * (two separate calls).
     *
     * @param 		&string $sIn Reference to SCSU-compressed string
     * @return 		string UTF-8 string
     * @throws 		SCSU_Exception on truncated input, bad window offsets or an Srs byte
     */
    public function decompress(&$sIn) {
        $this->reset();
        $iInLen = strlen($sIn);
        $sOut = '';

        for ($iCur = 0; $iCur < $iInLen; $iCur++) {
            $iStaticWindow = 0;
            $iDynamicWindow = $this->selectedWindow;
            $tag = ord($sIn[$iCur]);

            switch ($tag) {
            case self::SQ0:
            case self::SQ1:
            case self::SQ2:
            case self::SQ3:
            case self::SQ4:
            case self::SQ5:
            case self::SQ6:
            case self::SQ7:
                // The quoted byte is missing: leave the loop so that the
                // check after it reports premature end of input.
                if ($iCur >= $iInLen - 1) break 2;
                $iDynamicWindow = $iStaticWindow = $tag - self::SQ0;
                $iCur++;
                // intentional fall-through: decode the quoted byte
            default:
                $b = ord($sIn[$iCur]);
                if ($b < 128) {
                    $sOut .= UTF8::codepointToStr($b + self::$staticOffset[$iStaticWindow]);
                } else {
                    $ch = $b - 0x80 + $this->dynamicOffset[$iDynamicWindow];
                    if ($ch < 0x10000) {
                        $sOut .= UTF8::codepointToStr($ch);
                    } else {
                        // Split into a surrogate pair. The low half must be
                        // masked to 10 bits; masking with ~0xFC00 would keep
                        // bits 16+ and corrupt code points >= U+20000.
                        $ch -= 0x10000;
                        $sOut .= UTF8::codepointToStr(0xD800 + ($ch >> 10));
                        $sOut .= UTF8::codepointToStr(0xDC00 + ($ch & 0x3FF));
                    }
                }
                break;

            case self::SDX:
                $iCur += 2;
                if ($iCur >= $iInLen) break 2;
                $this->defineExtendedWindow(
                    $this->charFromTwoBytes(ord($sIn[$iCur - 1]), ord($sIn[$iCur]))
                );
                break;

            case self::SD0:
            case self::SD1:
            case self::SD2:
            case self::SD3:
            case self::SD4:
            case self::SD5:
            case self::SD6:
            case self::SD7:
                $iCur++;
                if ($iCur >= $iInLen) break 2;
                $this->defineWindow($tag - self::SD0, ord($sIn[$iCur]));
                break;

            case self::SC0:
            case self::SC1:
            case self::SC2:
            case self::SC3:
            case self::SC4:
            case self::SC5:
            case self::SC6:
            case self::SC7:
                $this->selectedWindow = $tag - self::SC0;
                break;

            case self::SCU:
                // Unicode mode: two bytes per unit until a UCn/UDn/UDX tag.
                // "break 2" below leaves this inner loop and the switch, so
                // the outer loop resumes in single-byte mode.
                $iCur++;
                for (; $iCur < $iInLen - 1; $iCur += 2) {
                    $b = ord($sIn[$iCur]);
                    if ($b >= self::UC0 && $b <= self::UC7) {
                        $this->selectedWindow = $b - self::UC0;
                        break 2;
                    } else if ($b >= self::UD0 && $b <= self::UD7) {
                        $this->defineWindow($b - self::UD0, ord($sIn[$iCur + 1]));
                        $iCur++;
                        break 2;
                    } else if ($b == self::UDX) {
                        // Needs two argument bytes; do not read past the end.
                        if ($iCur + 2 >= $iInLen) {
                            throw new SCSU_Exception('', SCSU_Exception::INPUT_ENDED);
                        }
                        $this->defineExtendedWindow(
                            $this->charFromTwoBytes(ord($sIn[$iCur + 1]), ord($sIn[$iCur + 2]))
                        );
                        $iCur += 2;
                        break 2;
                    } else if ($b == self::UQU) {
                        $iCur++;
                        // The quoted unit needs two bytes after the tag.
                        if ($iCur + 1 >= $iInLen) {
                            throw new SCSU_Exception('', SCSU_Exception::INPUT_ENDED);
                        }
                    }
                    $ch = $this->charFromTwoBytes(ord($sIn[$iCur]), ord($sIn[$iCur + 1]));
                    $sOut .= UTF8::codepointToStr($ch);
                }
                // A dangling odd byte means the stream was cut off.
                if ($iCur != $iInLen) throw new SCSU_Exception('', SCSU_Exception::INPUT_ENDED);
                break;

            case self::SQU:
                $iCur += 2;
                if ($iCur >= $iInLen) break 2;
                $ch = $this->charFromTwoBytes(ord($sIn[$iCur - 1]), ord($sIn[$iCur]));
                $sOut .= UTF8::codepointToStr($ch);
                break;

            case self::Srs:
                throw new SCSU_Exception('Pos. ' . $iCur . '.', SCSU_Exception::INPUT_SRS);
            }
        }

        if ($iCur < $iInLen) throw new SCSU_Exception('', SCSU_Exception::INPUT_ENDED);
        return $sOut;
    }

    // private methods

    // common private methods

    /**
     * Tells whether a unit can be addressed through a window, i.e. lies
     * outside the range 0x3400..0xDFFF.
     */
    private function isCompressible($ch) {
        return ($ch < 0x3400 || $ch >= 0xE000);
    }

    /**
     * Restores the initial window state: window 0 selected and all dynamic
     * windows at their default offsets.
     */
    private function reset() {
        $this->selectedWindow = 0;
        // Arrays are copied by value, so the static defaults stay untouched.
        $this->dynamicOffset = self::$initialDynamicOffset;
    }

    /**
     * Combines a high and a low byte into one 16-bit value. Negative
     * arguments are treated as signed bytes and wrapped into 0..255.
     */
    private function charFromTwoBytes($hi, $lo) {
        $low = ($lo >= 0 ? $lo : 256 + $lo);
        $high = ($hi >= 0 ? $hi : 256 + $hi);
        return $low + ($high << 8);
    }

    // compression related private methods

    /**
     * Compresses the chunk currently held in $this->aIn, appending to
     * $this->sOut. May be called repeatedly; encoder state persists.
     *
     * @throws SCSU_Exception
     */
    private function compress_part() {
        $this->iInLen = count($this->aIn);
        $this->iIn = 0;

        while ($this->iIn < $this->iInLen) {
            // Continue in Unicode mode either after a fresh SCU tag or when
            // the previous chunk ended while the output was still in Unicode
            // mode. Without the second condition the new chunk would be
            // encoded with single-byte logic and emit a stray SCU/SQU tag
            // into a stream the decoder reads as two-byte units.
            if ($this->iSCU != -1 || $this->fUnicodeMode) {
                $ch = $this->outputUnicodeRun();
                if ($this->iSCU != -1 && strlen($this->sOut) - $this->iSCU == 3) {
                    // Only one unit followed SCU: quoting it with SQU is
                    // shorter and keeps single-byte mode.
                    $this->sOut[$this->iSCU] = chr(self::SQU);
                    $this->iSCU = -1;
                    continue;
                }
                $this->iSCU = -1;
                $this->fUnicodeMode = true;
            } else {
                $ch = $this->outputSingleByteRun();
            }

            if ($this->iIn == $this->iInLen) break;

            // When the stalled unit is ASCII, look ahead for the first
            // non-ASCII unit to choose a window for; fall back to the
            // current unit if the input ends or an incompressible unit
            // comes first.
            for ($ich = $this->iIn; $ch < 0x80; $ich++) {
                if ($ich == $this->iInLen || !$this->isCompressible($this->aIn[$ich])) {
                    $ch = $this->aIn[$this->iIn];
                    break;
                }
                $ch = $this->aIn[$ich];
            }

            $iprevWindow = $this->selectedWindow;

            if ($ch < 0x80 || $this->locateWindow($ch, $this->dynamicOffset)) {
                if (!$this->fUnicodeMode && $this->iIn < $this->iInLen - 1) {
                    // If the next unit is back in the previous window, quote
                    // this one instead of switching windows twice.
                    $ch2 = $this->aIn[$this->iIn + 1];
                    $prevStart = $this->dynamicOffset[$iprevWindow];
                    if ($ch2 >= $prevStart && $ch2 < $prevStart + 0x80) {
                        $this->quoteSingleByte($ch);
                        $this->selectedWindow = $iprevWindow;
                        continue;
                    }
                }

                $this->sOut .= chr(($this->fUnicodeMode ? self::UC0 : self::SC0) + $this->selectedWindow);
                $this->fUnicodeMode = false;
            } else if (!$this->fUnicodeMode && $this->locateWindow($ch, self::$staticOffset)) {
                $this->quoteSingleByte($ch);
                $this->selectedWindow = $iprevWindow;
                continue;
            } else if ($this->positionWindow($ch)) {
                $this->fUnicodeMode = false;
            } else {
                // No window can hold the unit: tentatively enter Unicode
                // mode and remember where the tag sits.
                $this->iSCU = strlen($this->sOut);
                $this->sOut .= chr(self::SCU);
                continue;
            }
        }
    }

    /**
     * Looks for a window in $offsetTable that contains $ch, preferring the
     * currently selected one. On success the matching index becomes the
     * selected window.
     *
     * @return bool True when a window was found
     */
    private function locateWindow($ch, $offsetTable) {
        $iWin = $this->selectedWindow;
        if ($iWin != -1 && $ch >= $offsetTable[$iWin] && $ch < $offsetTable[$iWin] + 0x80) {
            return true;
        }
        $tableLen = count($offsetTable);
        for ($iWin = 0; $iWin < $tableLen; $iWin++) {
            if ($ch >= $offsetTable[$iWin] && $ch < $offsetTable[$iWin] + 0x80) {
                $this->selectedWindow = $iWin;
                return true;
            }
        }
        return false;
    }

    /**
     * True for 0x20..0x7F and for TAB, LF and CR.
     */
    private function isAsciiCrLfOrTab($ch) {
        return ($ch >= 0x20 && $ch <= 0x7F) || $ch == 0x09 || $ch == 0x0A || $ch == 0x0D;
    }

    /**
     * Emits units in single-byte mode for as long as they are ASCII,
     * control characters (quoted with SQ0) or inside the selected dynamic
     * window. A surrogate pair in the input is combined into one value.
     *
     * @return int The first unit that could not be emitted, or 0 when the
     *             whole chunk was consumed
     * @throws SCSU_Exception on unpaired surrogates
     */
    private function outputSingleByteRun() {
        $iWin = $this->selectedWindow;

        while ($this->iIn < $this->iInLen) {
            $ch = $this->aIn[$this->iIn];
            $consumed = 1;

            if (($ch & 0xF800) == 0xD800) {
                if (($ch & 0xFC00) == 0xDC00) {
                    throw new SCSU_Exception('Byte #' . $this->iIn . '.', SCSU_Exception::INPUT_UNP_LOW);
                }
                if ($this->iIn >= $this->iInLen - 1) {
                    throw new SCSU_Exception('', SCSU_Exception::INPUT_ENDED);
                }
                $ch2 = $this->aIn[$this->iIn + 1];
                if (($ch2 & 0xFC00) != 0xDC00) {
                    throw new SCSU_Exception('Byte #' . ($this->iIn + 1) . '.', SCSU_Exception::INPUT_UNP_HIGH);
                }
                $ch = (($ch - 0xD800) << 10 | ($ch2 - 0xDC00)) + 0x10000;
                $consumed = 2;
            }

            $windowStart = $this->dynamicOffset[$iWin];
            if ($this->isAsciiCrLfOrTab($ch) || $ch == 0) {
                $this->sOut .= chr($ch & 0x7F);
            } else if ($ch < 0x20) {
                // Other control characters must be quoted from static window 0.
                $this->sOut .= chr(self::SQ0);
                $this->sOut .= chr($ch & 255);
            } else if ($ch >= $windowStart && $ch < $windowStart + 0x80) {
                $this->sOut .= chr(($ch - $windowStart) | 0x80);
            } else {
                // Caller has to pick another window or mode for this unit.
                return $ch;
            }

            $this->iIn += $consumed;
        }
        return 0;
    }

    /**
     * Emits SQn plus one byte for $ch, using the selected window index n
     * (dynamic window first, then the static window of the same index),
     * and advances the input by one unit.
     *
     * @throws SCSU_Exception when $ch is in neither window
     */
    private function quoteSingleByte($ch) {
        $iWin = $this->selectedWindow;
        $this->sOut .= chr((self::SQ0 + $iWin) & 255);

        $dynStart = $this->dynamicOffset[$iWin];
        $statStart = self::$staticOffset[$iWin];
        if ($ch >= $dynStart && $ch < $dynStart + 0x80) {
            $this->sOut .= chr(($ch - $dynStart) | 0x80);
        } else if ($ch >= $statStart && $ch < $statStart + 0x80) {
            $this->sOut .= chr(($ch - $statStart) & 255);
        } else {
            throw new SCSU_Exception('ch = ' . $ch . ' not valid in quoteSingleByte.');
        }

        $this->iIn++;
    }

    /**
     * Emits units as big-endian byte pairs (Unicode mode). Stops, without
     * consuming it, at a compressible unit that is followed by another
     * compressible unit. Units 0xE000..0xF2FF are prefixed with UQU because
     * their high byte collides with the Unicode-mode tag range.
     *
     * @return int The last unit examined (0 if none)
     */
    private function outputUnicodeRun() {
        $ch = 0;
        while ($this->iIn < $this->iInLen) {
            $ch = $this->aIn[$this->iIn];
            $needsQuote = false;

            if ($this->isCompressible($ch)) {
                if ($this->iIn < $this->iInLen - 1
                    && $this->isCompressible($this->aIn[$this->iIn + 1])) {
                    break;
                }
                $needsQuote = ($ch >= 0xE000 && $ch <= 0xF2FF);
            }

            if ($needsQuote) $this->sOut .= chr(self::UQU);
            $this->sOut .= chr($ch >> 8);
            $this->sOut .= chr($ch & 0xFF);
            $this->iIn++;
        }
        return $ch;
    }

    /**
     * Redefines the next dynamic window (round robin via $iNextWindow) so
     * that it contains $ch, emits the matching SDn/UDn or SDX/UDX tag and
     * selects the window.
     *
     * @return bool False when $ch lies in 0x3400..0xDFFF and no window can
     *              be defined for it
     * @throws SCSU_Exception when $ch < 0x80
     */
    private function positionWindow($ch) {
        $iWin = $this->iNextWindow % 8;
        $iPosition = 0;

        if ($ch < 0x80) throw new SCSU_Exception('ch < 0x80.');

        // Note: 0 doubles as "no match", so a hit on fixedOffset[0] falls
        // through to the generic offset calculation below.
        $fixedCount = count(self::$fixedOffset);
        for ($i = 0; $i < $fixedCount; $i++) {
            if ($ch >= self::$fixedOffset[$i] && $ch < self::$fixedOffset[$i] + 0x80) {
                $iPosition = $i;
                break;
            }
        }

        if ($iPosition != 0) {
            $this->dynamicOffset[$iWin] = self::$fixedOffset[$iPosition];
            $iPosition += 0xF9;
        } else if ($ch < 0x3400) {
            $iPosition = $ch >> 7;
            $this->dynamicOffset[$iWin] = $ch & 0xFF80;
        } else if ($ch < 0xE000) {
            return false;
        } else if ($ch <= 0xFFFF) {
            $iPosition = (($ch - self::gapOffset) >> 7);
            $this->dynamicOffset[$iWin] = $ch & 0xFF80;
        } else {
            // Extended window: window index in the top 3 bits of the
            // 16-bit argument, offset in the lower 13.
            $iPosition = ($ch - 0x10000) >> 7;
            $iPosition |= $iWin << 13;
            $this->dynamicOffset[$iWin] = $ch & 0x1FFF80;
        }

        if ($iPosition < 0x100) {
            $this->sOut .= chr(($this->fUnicodeMode ? self::UD0 : self::SD0) + $iWin);
            $this->sOut .= chr($iPosition & 0xFF);
        } else {
            $this->sOut .= chr($this->fUnicodeMode ? self::UDX : self::SDX);
            $this->sOut .= chr(($iPosition >> 8) & 0xFF);
            $this->sOut .= chr($iPosition & 0xFF);
        }

        $this->selectedWindow = $iWin;
        $this->iNextWindow++;
        return true;
    }

    // decompression related private methods

    /**
     * Applies an SDn/UDn tag: sets dynamic window $iWindow from the offset
     * byte $bOffset and selects it.
     *
     * @throws SCSU_Exception for a zero or reserved offset byte
     */
    private function defineWindow($iWindow, $bOffset) {
        $iOffset = ($bOffset < 0 ? $bOffset + 256 : $bOffset);

        if ($iOffset == 0) {
            throw new SCSU_Exception('', SCSU_Exception::INPUT_OFF_ZERO);
        } else if ($iOffset < self::gapThreshold) {
            $this->dynamicOffset[$iWindow] = $iOffset << 7;
        } else if ($iOffset < self::reservedStart) {
            $this->dynamicOffset[$iWindow] = ($iOffset << 7) + self::gapOffset;
        } else if ($iOffset < self::fixedThreshold) {
            throw new SCSU_Exception('Value = ' . $iOffset . '.', SCSU_Exception::INPUT_OFF_BAD);
        } else {
            $this->dynamicOffset[$iWindow] = self::$fixedOffset[$iOffset - self::fixedThreshold];
        }

        $this->selectedWindow = $iWindow;
    }

    /**
     * Applies an SDX/UDX tag: the top 3 bits of $chOffset give the window,
     * the lower 13 bits the offset (in units of 0x80) above 0x10000.
     */
    private function defineExtendedWindow($chOffset) {
        $iWindow = $chOffset >> 13;
        $this->dynamicOffset[$iWindow] = (($chOffset & 0x1FFF) << 7) + (1 << 16);
        $this->selectedWindow = $iWindow;
    }
}

/**
 * Provides exceptions of SCSU errors
 *
 * See SCSU.php File description for full information
 *
 * @author 	Alexey A.Znaev <znaeff@mail.ru> <http://xbsoft.org>
 * @link 	http://xbsoft.org
 * @package 	SCSU
 * @version 	1.0
 * @since 	1.0
 */
class SCSU_Exception extends Exception {
    // Error codes: the high nibble is the error class, the low nibble the
    // specific error inside that class.
    const INTERNAL 		= 0x00;
    const INPUT 		= 0x10;
    const INPUT_ENDED 		= 0x11;
    const INPUT_UNP_LOW 	= 0x12;
    const INPUT_UNP_HIGH 	= 0x13;
    const INPUT_OFF_ZERO 	= 0x14;
    const INPUT_OFF_BAD 	= 0x15;
    const INPUT_SRS 		= 0x16;
    const OUTPUT 		= 0x20;

    // Standard message text per error code.
    private static $Messages = array(
	self::INTERNAL 		=> 'Internal error.',
	self::INPUT 		=> 'Illegal input.',
	self::INPUT_ENDED 	=> 'Ended prematurely.',
	self::INPUT_UNP_LOW 	=> 'Unpaired low surrogate.',
	self::INPUT_UNP_HIGH 	=> 'Unpaired high surrogate.',
	self::INPUT_OFF_ZERO 	=> 'Zero offset.',
	self::INPUT_OFF_BAD 	=> 'Bad offset.',
	self::INPUT_SRS 	=> 'Srs byte found.',
	self::OUTPUT 		=> 'Bad output.',
    );

    /**
     * Builds the message as "SCSU: <class text> <specific text> <message>",
     * leaving out the parts that do not apply.
     *
     * The $previous parameter carries no type hint on purpose: an implicitly
     * nullable hint ("Exception $previous = null") is deprecated as of
     * PHP 8.4, and the parent constructor validates the value anyway.
     *
     * @param string         $message  Extra detail appended after the standard text
     * @param int            $code     One of the class constants
     * @param Exception|null $previous Previous exception for chaining
     */
    public function __construct($message = '', $code = 0x00, $previous = null) {
	$parts = array();
	$code_class = $code & 0xF0;
	if (array_key_exists($code_class, self::$Messages)) {
	    $parts[] = self::$Messages[$code_class];
	}
	if (($code != $code_class) && array_key_exists($code, self::$Messages)) {
	    $parts[] = self::$Messages[$code];
	}
	// Only append the caller's detail when there is one, so messages do
	// not end in a dangling space.
	if ((string) $message !== '') {
	    $parts[] = $message;
	}
        parent::__construct('SCSU: ' . implode(' ', $parts), $code, $previous);
    }
}

?>