File: classes/parser/tokenizer.php

Recommend this page to a friend!
???
File:	`classes/parser/tokenizer.php`
Role:	Class source
Content type:	`text/plain`
Description:	Tokenizer class
Class:	PHP CSS Parser Get HTML document nodes matching a CSS selector
Author:	By Gonzalo Chumillas
Last change:
Date:	11 years ago
Size:	`9,911 bytes`
Download

<?php

/**

 * This file contains the Tokenizer class.

 * 

 * @author Gonzalo Chumillas <[email protected]>

 * @package parser

 */

 

/**

 * class Tokenizer

 * This class can not only split a string into smaller pieces called tokens,

 * but can also be used to parse a string on the fly.

 */

class Tokenizer {
    /**
     * Asks match() to report the position of every captured group.
     * When set, each entry of the $matches array filled in by match() is an
     * array(text, offset) pair whose offset is relative to the whole string.
     */
    const OFFSET_CAPTURE = 0x1;

    /**
     * Distinguishes between uppercase and lowercase characters.
     * Only applies to regular expressions written without delimiters.
     */
    const CASE_SENSITIVE = 0x4;

    /**
     * Searches for a match anywhere after the current offset instead of
     * requiring it to start right at the offset (leading whitespace aside).
     * Only applies to regular expressions written without delimiters.
     */
    const SEARCH_ANYWHERE = 0x8;

    /**
     * This regular pattern describes a "token".
     * A token is one or more "word" characters or a single "non-word" character. For example:
     *
     * hello_there125 -- this is a token because it is a sequence of "word" characters
     * % -- this is a token because it is a single "non-word" character
     * %! -- this is NOT a token (it is two tokens)
     */
    const TOKEN = "\w+|.";

    /**
     * This regular pattern describes an "identifier".
     * An identifier is an alphabetic character followed by "word" characters. For example:
     *
     * odyssey2001 -- is an identifier
     * james_bond  -- is an identifier
     * 007bond     -- is NOT an identifier because the first character is not alphabetic
     */
    const IDENTIFIER = "[a-z]\w*";

    /**
     * This regular pattern describes a floating point number
     * (optional sign, integer part, optional fraction, optional exponent).
     */
    const NUMBER = '[+-]?(0|[1-9][0-9]*)(\.[0-9]+)?([eE][+-]?[0-9]+)?';

    /**
     * This regular pattern describes a quoted string.
     * Either single or double quotes can be used as delimiters, and the delimiter
     * can appear inside the string when preceded by a backslash. Examples:
     *
     * 'hello there'
     * 'hello \'there'
     * "hello there"
     * "hello \"there"
     *
     * The back-reference \2 is intentional: match() wraps the pattern in one
     * extra group, so the quote character ends up in group 2 and the content in group 3.
     */
    const STRING = '(["\'])((?:\\\\\2|.)*?)\2';

    /**
     * Default flags, combined (bitwise OR) with the flags passed to each call.
     * @var int
     */
    private $flags;

    /**
     * The string to be parsed.
     * @var string
     */
    protected $string;

    /**
     * The current offset (in bytes) inside the string.
     * @var int
     */
    protected $offset;

    /**
     * @param string $string The string to be parsed
     * @param int    $flags  A bitmask of Tokenizer::OFFSET_CAPTURE, Tokenizer::CASE_SENSITIVE
     *                       and Tokenizer::SEARCH_ANYWHERE applied to every call
     */
    public function __construct($string, $flags = 0) {
        $this->string = $string;
        $this->offset = 0;
        $this->flags = $flags;
    }

    /**
     * Is the next token equal to a given string?
     * When successful, this function advances the offset and returns an array with a single
     * string (the text found in the input). Otherwise, it returns FALSE.
     *
     * @param string $str   The literal text to look for
     * @param int    $flags Extra flags for this call
     * @return array|FALSE
     */
    public function eq($str, $flags = 0) {
        // preg_quote() turns $str into a literal pattern and escapes the "/" delimiter
        // that match() wraps around it.
        return $this->match(preg_quote($str, "/"), $matches, $flags);
    }

    /**
     * Is the next token one of a given list?
     * Longer items are tried first, so "<=" wins over "<" when both could match.
     * When successful, this function advances the offset and returns an array with a single
     * string (the item as it was given in the list). Otherwise, it returns FALSE.
     *
     * @param array $items An array of strings
     * @param int   $flags Extra flags for this call
     * @return array|FALSE
     */
    public function in($items, $flags = 0) {
        // Sorts the items in descending order of length. The comparator returns an
        // integer: returning a boolean from a usort() callback is deprecated in PHP 8.
        usort($items, function ($item1, $item2) {
            return strlen($item2) - strlen($item1);
        });

        foreach ($items as $item) {
            if ($this->eq($item, $flags)) {
                return array($item);
            }
        }

        return FALSE;
    }

    /**
     * Is the next token a number?
     * When successful, this function advances the offset and returns an array with a single
     * string (the number as written). Otherwise, it returns FALSE.
     *
     * @param int $flags Extra flags for this call
     * @return array|FALSE
     */
    public function number($flags = 0) {
        return $this->match(self::NUMBER, $matches, $flags);
    }

    /**
     * Is the next token a quoted string?
     * When successful, this function advances the offset and returns an array with a single
     * string: the content between the quotes, with escaped delimiters unescaped.
     * Otherwise, it returns FALSE.
     *
     * @param int $flags Extra flags for this call
     * @return array|FALSE
     */
    public function str($flags = 0) {
        if (!$this->match(self::STRING, $matches, $flags)) {
            return FALSE;
        }

        // Group 2 is the quote character and group 3 the raw content. Under
        // OFFSET_CAPTURE each group is an array(text, offset) pair, so take the text.
        $delimiter = is_array($matches[2]) ? $matches[2][0] : $matches[2];
        $content = is_array($matches[3]) ? $matches[3][0] : $matches[3];

        return array(str_replace("\\" . $delimiter, $delimiter, $content));
    }

    /**
     * Is the next a token?
     * When successful, this function advances the offset and returns an array with a single
     * string. Otherwise, it returns FALSE.
     * Example:
     *
     * <code>
     * // splits a string into tokens
     * $t = new Tokenizer("lorem ipsum; dolor sit amet.");
     * while (list($token) = $t->token()) {
     *     echo "$token-";
     * }
     * </code>
     *
     * @return array|FALSE
     */
    public function token() {
        return $this->match(self::TOKEN);
    }

    /**
     * Is the next token an identifier?
     * When successful, this function advances the offset and returns an array with a single
     * string. Otherwise, it returns FALSE.
     *
     * @return array|FALSE
     */
    public function id() {
        return $this->match(self::IDENTIFIER);
    }

    /**
     * Compares the string with a regular expression and advances the offset if they match.
     * When successful, this function returns an array with a single string (the matched text
     * without leading whitespace). Otherwise, it returns FALSE.
     *
     * The regular expression can be written with or without delimiters. The slash character
     * is reserved as the delimiter: an expression starting with "/" is used as is, any other
     * expression is wrapped so that it skips leading whitespace, is anchored at the current
     * offset (unless Tokenizer::SEARCH_ANYWHERE is set) and is case-insensitive (unless
     * Tokenizer::CASE_SENSITIVE is set). A literal slash inside an expression without
     * delimiters must therefore be escaped. For example:
     *
     * <code>
     * // these two lines are equivalent
     * $t->match("\w+");
     * $t->match("/^\s*(\w+)/usi");
     * </code>
     *
     * Because of that wrapping group, the groups of an expression without delimiters are
     * shifted by one in $matches: index 0 is the whole match (leading whitespace included),
     * index 1 is the expression itself and index 2 is its first group.
     *
     * A match is rejected when it would split a word, that is, when the matched text ends
     * with an alphanumeric character and the next character is alphanumeric too. This is why
     * "select" does not match the beginning of "selection".
     *
     * More examples:
     *
     * <code>
     * // splits a string into "words"
     * $t = new Tokenizer("Lorem ipsum dolor sit amet");
     * while (list($token) = $t->match("\w+", $matches)) {
     *     echo "$token-";
     * }
     * </code>
     *
     * <code>
     * // captures the offset
     * $t = new Tokenizer("I am 105 years old");
     * if ($t->match("/\d+/", $matches, Tokenizer::OFFSET_CAPTURE)) {
     *     print_r($matches);
     * }
     * </code>
     *
     * <code>
     * // parses a basic SQL sentence
     * $t = new Tokenizer("Select Id, Name, Age From users Where Id = 101");
     * if ($t->match("select")) {
     *     // columns
     *     $columns = array();
     *     while (list($column) = $t->match("\w+")) {
     *         array_push($columns, $column);
     *         if (!$t->match(",")) {
     *             break;
     *         }
     *     }
     *     // `from` clause
     *     if ($t->match("from\s+(\w+)", $matches)) {
     *         $table_name = $matches[2];
     *         echo "You want to get the columns " . implode(", ", $columns) . " from the table $table_name.";
     *     }
     * }
     * </code>
     *
     * @param string $regexp   Regular expression, with or without "/" delimiters
     * @param array  &$matches Receives the captured groups, as preg_match() does. With
     *                         Tokenizer::OFFSET_CAPTURE each entry is array(text, offset),
     *                         the offset being relative to the whole string (-1 for a group
     *                         that did not take part in the match)
     * @param int    $flags    Extra flags for this call
     * @return array|FALSE
     */
    public function match($regexp, &$matches = array(), $flags = 0) {
        $flags = $this->flags | $flags;
        // substr() returns FALSE at the end of the input on old PHP versions.
        $subject = (string) substr($this->string, $this->offset);
        $explicit_regexp = strlen($regexp) > 0 && $regexp[0] == "/";

        if (!$explicit_regexp) {
            $modifiers = "us" . (($flags & self::CASE_SENSITIVE) ? "" : "i");
            $regexp = ($flags & self::SEARCH_ANYWHERE)
                ? "/($regexp)/$modifiers"
                : "/^\s*($regexp)/$modifiers";
        }

        if (!preg_match($regexp, $subject, $matches, PREG_OFFSET_CAPTURE)) {
            return FALSE;
        }

        $text = $matches[0][0];
        // position, inside $subject, of the first character after the match
        $end = $matches[0][1] + strlen($text);

        if ($flags & self::OFFSET_CAPTURE) {
            // makes the offsets relative to the whole string; groups that did not
            // take part in the match keep the -1 reported by preg_match()
            foreach ($matches as $i => $match) {
                if ($match[1] >= 0) {
                    $matches[$i][1] += $this->offset;
                }
            }
        } else {
            // drops the offsets
            foreach ($matches as $i => $match) {
                $matches[$i] = $match[0];
            }
        }

        // Rejects a match that stops in the middle of a word. An empty match at the very
        // beginning ($end == 0) has no last character to inspect, so it cannot split one.
        $splits_word = $end > 0
            && $end < strlen($subject)
            && ctype_alnum($subject[$end - 1])
            && ctype_alnum($subject[$end]);

        if ($splits_word) {
            return FALSE;
        }

        $this->offset += $end;

        return array(ltrim($text));
    }

    /**
     * Gets the offset position.
     * @return int
     */
    public function getOffset() {
        return $this->offset;
    }

    /**
     * Sets the offset position. The value is stored as given, without validation.
     * @param int $value
     */
    public function setOffset($value) {
        $this->offset = $value;
    }

    /**
     * Gets the string.
     * @return string
     */
    public function getString() {
        return $this->string;
    }

    /**
     * Has the offset reached the end of the string? Trailing whitespace is ignored.
     * @return boolean
     */
    public function end() {
        return $this->offset >= strlen(rtrim($this->string));
    }
}
About us
Advertise on this site
For more information send a message to info at phpclasses dot org.
File: classes/parser/tokenizer.php

Contents