<?php
/**
* This file contains the TextTokenizer class.
*
* PHP Version 5.3
*
* @category Text
* @package TextTokenizer
* @author Gonzalo Chumillas <gonzalo@soloproyectos.com>
* @license https://raw.github.com/soloproyectos/core/master/LICENSE BSD 2-Clause License
* @link https://github.com/soloproyectos/core
*/
namespace com\soloproyectos\common\text\tokenizer;
/**
* Class TextTokenizer
* This class can parse and split a string into tokens. It can take a string and
* split it to retrieve smaller tokens one by one. The format of the tokens is
* defined by regular expressions passed to the class as parameters.
*
* @category Text
* @package TextTokenizer
* @author Gonzalo Chumillas <gonzalo@soloproyectos.com>
* @license https://raw.github.com/soloproyectos/core/master/LICENSE BSD 2-Clause License
* @link https://github.com/soloproyectos/core
*/
class TextTokenizer
{
    /**
     * Offset capture flag.
     *
     * This flag is similar to the PREG_OFFSET_CAPTURE flag, used in the preg_match
     * function. When it is set, every entry of the $matches array filled by
     * match() is a pair array(text, offset) instead of a plain string. Offsets
     * are relative to the beginning of the whole string.
     */
    const OFFSET_CAPTURE = 0x1;

    /**
     * Case sensitive flag.
     *
     * When this flag is set, non-delimited patterns distinguish between
     * lowercase and uppercase characters. Matching is case-insensitive otherwise.
     */
    const CASE_SENSITIVE = 0x4;

    /**
     * Search anywhere flag.
     *
     * When this flag is set, non-delimited patterns are searched anywhere after
     * the current offset position, instead of being anchored to it.
     */
    const SEARCH_ANYWHERE = 0x8;

    /**
     * This regular pattern describes a "token".
     *
     * <p>A token is one or more "word" characters or a single "non-word"
     * character. For example:</p>
     *
     * <pre>
     * hello_there125 -- this is a token, as it is a sequence of "word" chars.
     * %              -- this is a token, as it is a single "non-word" character.
     * %!             -- this is NOT a token
     * </pre>
     */
    const TOKEN = "\w+|.";

    /**
     * This regular pattern describes an "identifier".
     *
     * <p>An identifier is an alphabetic character followed by "word"
     * characters. For example:</p>
     *
     * <pre>
     * odyssey2001 -- is an identifier
     * james_bond  -- is an identifier
     * 007bond     -- is NOT an identifier
     * </pre>
     */
    const IDENTIFIER = "[a-z]\w*";

    /**
     * This regular pattern describes a number.
     */
    const NUMBER = '[+-]?(0|[1-9][0-9]*)(\.[0-9]+)?([eE][+-]?[0-9]+)?';

    /**
     * This regular pattern describes a string.
     *
     * <p>You can use either single or double quotes delimiters. The following
     * examples are strings:</p>
     *
     * <pre>
     * 'hello there'
     * 'hello \'there'
     * "hello there"
     * "hello \"there"
     * </pre>
     *
     * <p>The back-reference is "\2" (not "\1") because match() wraps
     * non-delimited patterns in an additional outer capturing group.</p>
     */
    const STRING = '(["\'])((?:\\\\\2|.)*?)\2';

    /**
     * Global flags, combined (bitwise OR) with the flags of each call.
     * @var integer
     */
    private $_flags;

    /**
     * The string to be parsed.
     * @var string
     */
    protected $string;

    /**
     * The current offset (in bytes).
     * @var integer
     */
    protected $offset;

    /**
     * Constructor.
     *
     * The $flags argument admits a combination of the following values:
     * TextTokenizer::OFFSET_CAPTURE, TextTokenizer::CASE_SENSITIVE and
     * TextTokenizer::SEARCH_ANYWHERE.
     *
     * @param string  $string The string to be parsed
     * @param integer $flags  Flags (default is 0)
     */
    public function __construct($string, $flags = 0)
    {
        $this->string = $string;
        $this->offset = 0;
        $this->_flags = $flags;
    }

    /**
     * Is the next token equal to a given string?
     *
     * This function returns false if the next token is not equal to the given
     * string. Otherwise it advances the offset and returns an array with a
     * single string (the matched text).
     *
     * @param string  $str   A string
     * @param integer $flags Flags (default is 0)
     *
     * @return false|array of a single string
     */
    public function eq($str, $flags = 0)
    {
        // the string is taken literally: every regex metacharacter is escaped
        return $this->match(preg_quote($str, "/"), $matches, $flags);
    }

    /**
     * Is the next token in a given list?
     *
     * This function returns false if the next token is not in the given list.
     * Otherwise it advances the offset and returns an array with a single
     * string: the list item that matched (as written in $items).
     *
     * @param array   $items An array of strings
     * @param integer $flags Flags (default is 0)
     *
     * @return false|array of a single string
     */
    public function in($items, $flags = 0)
    {
        // Longest items first, so that "<=" is tried before "<". The comparator
        // returns an integer, as usort() requires (a boolean result is
        // deprecated as of PHP 8 and was never a valid ordering).
        usort(
            $items,
            function ($item1, $item2) {
                return strlen($item2) - strlen($item1);
            }
        );

        foreach ($items as $item) {
            if ($this->eq($item, $flags)) {
                return array($item);
            }
        }

        return false;
    }

    /**
     * Is the next token a number?
     *
     * This function returns false if the next token is not a number. Otherwise
     * it advances the offset and returns an array with a single string.
     *
     * @param integer $flags Flags (default is 0)
     *
     * @return false|array of a single string
     */
    public function number($flags = 0)
    {
        return $this->match(self::NUMBER, $matches, $flags);
    }

    /**
     * Is the next token a string?
     *
     * This function returns false if the next token is not a quoted string.
     * Otherwise it advances the offset and returns an array with a single
     * string: the contents between the delimiters, with escaped delimiters
     * (\' or \") replaced by the bare delimiter.
     *
     * @param integer $flags Flags (default is 0)
     *
     * @return false|array of a single string
     */
    public function str($flags = 0)
    {
        if (!$this->match(self::STRING, $matches, $flags)) {
            return false;
        }

        // Group 1 is the wrapper added by match(); group 2 is the delimiter
        // and group 3 the contents. When OFFSET_CAPTURE is active (globally or
        // through $flags) each group is a pair array(text, offset), so the
        // text has to be extracted before it can be used as a string.
        $delimiter = is_array($matches[2]) ? $matches[2][0] : $matches[2];
        $contents = is_array($matches[3]) ? $matches[3][0] : $matches[3];

        return array(str_replace("\\" . $delimiter, $delimiter, $contents));
    }

    /**
     * Gets the next token.
     *
     * <p>This function returns false if there are no more tokens or an array with a
     * single string. For example:</p>
     *
     * <pre>// splits a string into tokens
     * $t = new TextTokenizer("lorem ipsum; dolor sit amet.");
     * while (list($token) = $t->token()) {
     *     echo "$token-";
     * }
     * </pre>
     *
     * @return false|array of a single string
     */
    public function token()
    {
        return $this->match(self::TOKEN);
    }

    /**
     * Is the next token an identifier?
     *
     * This function returns false if the next token is not an identifier.
     * Otherwise it advances the offset and returns an array with a single
     * string.
     *
     * @return false|array of a single string
     */
    public function id()
    {
        return $this->match(self::IDENTIFIER);
    }

    /**
     * Matches the string against a regex.
     *
     * <p>This function matches the string against a regular expression. If they
     * match, it advances the offset position and returns an array with a single
     * string. Otherwise, it returns false. You can use regex without delimiters.
     * Instead of using /^\s*(\w+)/, you can use simply '\w+'. For example:</p>
     *
     * <pre>// these two lines are identical
     * if ($t->match("\w+")) doSomething();
     * if ($t->match("/^\s*(\w+)/")) doSomething();
     * </pre>
     *
     * <p>Patterns starting with "/" are used verbatim; the CASE_SENSITIVE and
     * SEARCH_ANYWHERE flags only affect non-delimited patterns. Non-delimited
     * patterns are wrapped in an extra capturing group, so the first group of
     * your own pattern is $matches[2].</p>
     *
     * <p>A match is rejected when it would end in the middle of a word, that
     * is, when both the last matched character and the character that follows
     * it are alphanumeric.</p>
     *
     * <p>Example 1:</p>
     *
     * <pre>// splits a string into "words"
     * $t = new TextTokenizer("Lorem ipsum dolor sit amet");
     * while (list($token) = $t->match("\w+", $matches)) {
     *     echo "$token-";
     * }
     * </pre>
     *
     * <p>Example 2:</p>
     *
     * <pre>// captures the offset
     * $t = new TextTokenizer("I am 105 years old");
     * if ($t->match("/\d+/", $matches, TextTokenizer::OFFSET_CAPTURE)) {
     *     print_r($matches);
     * }
     * </pre>
     *
     * <p>Example 3:</p>
     *
     * <pre>// parses a basic SQL sentence
     * $t = new TextTokenizer("Select Id, Name, Age From users Where Id = 101");
     * if ($t->match("select")) {
     *     // columns
     *     $columns = array();
     *     while (list($column) = $t->match("\w+")) {
     *         array_push($columns, $column);
     *         if (!$t->match(",")) {
     *             break;
     *         }
     *     }
     *     // `from` clause
     *     if ($t->match("from\s+(\w+)", $matches)) {
     *         $tableName = $matches[1];
     *         echo "You want to get the columns " . implode(", ", $columns) .
     *             " from the table $tableName.";
     *     }
     * }
     * </pre>
     *
     * @param string  $regexp  Regular expression
     * @param array   $matches Matches (default is array(), passed by reference).
     *                         Plain strings, or array(text, offset) pairs when
     *                         OFFSET_CAPTURE is active
     * @param integer $flags   Flags (default is 0)
     *
     * @return false|array of a single string
     */
    public function match($regexp, &$matches = array(), $flags = 0)
    {
        // we do not like empty strings
        if (strlen($regexp) == 0) {
            return false;
        }

        $allFlags = $this->_flags | $flags;

        // substr() returns false (not "") past the end of the string on
        // PHP < 8; the cast keeps the remainder a string in every version.
        $substr = (string) substr($this->string, $this->offset);

        if ($regexp[0] != "/") {
            // non-delimited pattern: add delimiters, modifiers and the anchor
            $modifiers = "us" . (($allFlags & self::CASE_SENSITIVE) ? "" : "i");
            $regexp = ($allFlags & self::SEARCH_ANYWHERE)
                ? "/($regexp)/$modifiers"
                : "/^\s*($regexp)/$modifiers";
        }

        if (!preg_match($regexp, $substr, $matches, PREG_OFFSET_CAPTURE)) {
            return false;
        }

        $str = $matches[0][0];
        // position, relative to $substr, right after the matched text
        $end = $matches[0][1] + strlen($str);

        foreach ($matches as $i => $match) {
            if ($allFlags & self::OFFSET_CAPTURE) {
                // Makes the offset relative to the whole string. Groups that
                // did not take part in the match keep the -1 sentinel.
                if ($match[1] >= 0) {
                    $matches[$i][1] = $match[1] + $this->offset;
                }
            } else {
                // ignores offsets
                $matches[$i] = $match[0];
            }
        }

        // Rejects a match ending in the middle of a word. Both indexes are
        // guarded: a zero-length match at position 0 has no previous
        // character, and a match reaching the end has no next character.
        $prevIsAlnum = $end > 0 && ctype_alnum($substr[$end - 1]);
        $nextIsAlnum = $end < strlen($substr) && ctype_alnum($substr[$end]);
        if ($prevIsAlnum && $nextIsAlnum) {
            return false;
        }

        $this->offset += $end;

        return array(ltrim($str));
    }

    /**
     * Gets the offset position.
     *
     * @return integer
     */
    public function getOffset()
    {
        return $this->offset;
    }

    /**
     * Sets the offset position.
     *
     * @param integer $value The new offset position
     *
     * @return void
     */
    public function setOffset($value)
    {
        $this->offset = $value;
    }

    /**
     * Gets the target string.
     *
     * @return string
     */
    public function getString()
    {
        return $this->string;
    }

    /**
     * Resets the parser and starts again from the beginning of the string.
     *
     * @return void
     */
    public function reset()
    {
        $this->offset = 0;
    }

    /**
     * Has the offset reached the end of the string?
     *
     * Trailing whitespace is not taken into account.
     *
     * @return boolean
     */
    public function end()
    {
        return $this->offset >= strlen(rtrim($this->string));
    }
}
|