PHP Classes

File: src/python/Regex.py

Recommend this page to a friend!
  Classes of Nikos M.   PHP Regex Analyzer and Composer   src/python/Regex.py   Download  
File: src/python/Regex.py
Role: Auxiliary data
Content type: text/plain
Description: Auxiliary data
Class: PHP Regex Analyzer and Composer
Analyze and compose regular expressions
Author: By Nikos M.
Last change:
Date: 3 years ago
Size: 56,472 bytes
 

Contents

Class file image Download
# -*- coding: UTF-8 -*-
##
#
#   Regex
#   @version: 1.1.0
#
#   A simple & generic Regular Expression Analyzer & Composer for PHP, Python, Node.js / Browser / XPCOM Javascript
#   https://github.com/foo123/RegexAnalyzer
#
##
import random
import math
import re
import copy


def is_array(x):
    """Return True when `x` is a list or a tuple."""
    return isinstance(x, (list, tuple))


def is_string(x):
    """Return True when `x` is a `str`."""
    return isinstance(x, str)


# lazily-initialised type of compiled regular expressions (see is_regexp)
TYPE_REGEXP = None


def is_regexp(x):
    """Return True when `x` is a compiled `re` pattern object."""
    global TYPE_REGEXP
    if TYPE_REGEXP is None:
        TYPE_REGEXP = type(re.compile(r'[a-z]'))
    return isinstance(x, TYPE_REGEXP)


def array(x):
    """Return `x` unchanged if it is a list/tuple, else wrap it in a list."""
    return x if is_array(x) else [x]


def is_nan(v):
    """Return True when `v` is a NaN float."""
    return math.isnan(v)


def rnd(a, b):
    """Return a random integer N with a <= N <= b."""
    return random.randint(a, b)


# characters that get an escape prefix inside / outside a character group
_ESCAPED_IN_CHARGROUP = '-^$|{}()[]/'
_ESCAPED_IN_PATTERN = '?*+.^$|{}()[]/'


def esc_re(s, esc, chargroup=False):
    """Prefix every regex-special character of `s` with `esc`.

    When `chargroup` is truthy the character-group rule set is used
    ('-' is escaped, '?', '*', '+' and '.' are not); otherwise the rule set
    for text outside a character group is used. The escape character
    itself is always escaped.
    """
    special = _ESCAPED_IN_CHARGROUP if chargroup else _ESCAPED_IN_PATTERN
    return ''.join((esc + c) if (c in special) or (esc == c) else c for c in s)


def pad(s, n, z='0'):
    """Left-pad `str(s)` with `z` until it is at least `n` characters long."""
    ps = str(s)
    while len(ps) < n:
        ps = z + ps
    return ps


def char_code(c):
    """Return the code point of the first character of `c`."""
    return ord(c[0])


def char_code_range(s):
    """Return `[code of first item, code of last item]` of sequence `s`."""
    return [ord(s[0]), ord(s[-1])]


def concat(p1, p2=None):
    """Register every item of `p2` as a key (with value 1) of dict `p1`.

    `p1` is mutated and returned; a falsy `p2` leaves it untouched.
    """
    if p2:
        for p in p2:
            p1[p] = 1
    return p1


def character_range(first=None, last=None):
    """Return the list of characters from `first` to `last`, inclusive.

    `first` may also be a two-item list/tuple `[first, last]`.
    """
    if first and is_array(first):
        last = first[1]
        first = first[0]
    start = ord(first[0])
    end = ord(last[0])
    if end == start:
        return [chr(start)]
    return [chr(code) for code in range(start, end + 1)]


BSPACES = ["\r", "\n"]
SPACES = [" ", "\t", "\v"]
PUNCTS = ["~", "!", "@", "#", "$", "%", "^", "&", "*", "(", ")", "-", "+", "=", "[", "]", "{", "}", "\\", "|", ";", ":", ",", ".", "/", "<", ">", "?"]
DIGITS = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"]
DIGITS_RANGE = char_code_range(DIGITS)
HEXDIGITS_RANGES = [DIGITS_RANGE, [char_code("a"), char_code("f")], [char_code("A"), char_code("F")]]
ALPHAS = ["_"] + character_range("a", "z") + character_range("A", "Z")
ALL = SPACES + PUNCTS + DIGITS + ALPHAS

# node types (bit flags, combined into masks by the tree walkers)
T_SEQUENCE = 1
T_ALTERNATION = 2
T_GROUP = 4
T_CHARGROUP = 8
T_QUANTIFIER = 16
T_UNICODECHAR = 32
T_HEXCHAR = 64
T_SPECIAL = 128
T_CHARS = 256
T_CHARRANGE = 512
T_STRING = 1024
T_COMMENT = 2048

ESC = '\\'


class _G():
    """Namespace holding the shared lookup tables used by the parser."""

    # unescaped characters with a special meaning -> flag name
    specialChars = {
        ".": "MatchAnyChar",
        "|": "MatchEither",
        "?": "MatchZeroOrOne",
        "*": "MatchZeroOrMore",
        "+": "MatchOneOrMore",
        "^": "MatchStart",
        "$": "MatchEnd",
        "{": "StartRepeats",
        "}": "EndRepeats",
        "(": "StartGroup",
        ")": "EndGroup",
        "[": "StartCharGroup",
        "]": "EndCharGroup"
    }
    # characters with a special meaning when preceded by ESC -> flag name
    specialCharsEscaped = {
        "\\": "ESC",
        "/": "/",
        "0": "NULChar",
        "f": "FormFeed",
        "n": "LineFeed",
        "r": "CarriageReturn",
        "t": "HorizontalTab",
        "v": "VerticalTab",
        "b": "MatchWordBoundary",
        "B": "MatchNonWordBoundary",
        "s": "MatchSpaceChar",
        "S": "MatchNonSpaceChar",
        "w": "MatchWordChar",
        "W": "MatchNonWordChar",
        "d": "MatchDigitChar",
        "D": "MatchNonDigitChar"
    }
    BSPACES = BSPACES
    SPACES = SPACES
    PUNCTS = PUNCTS
    DIGITS = DIGITS
    DIGITS_RANGE = DIGITS_RANGE
    HEXDIGITS_RANGES = HEXDIGITS_RANGES
    ALPHAS = ALPHAS
    ALL = ALL


class RE_OBJ():
    """Mutable parsing cursor over a regular expression source string."""

    def __init__(self, regex):
        self.re = regex
        self.len = len(regex)
        self.pos = 0          # index of the next character to read
        self.index = 0        # number of groups opened so far (captured or not)
        self.groupIndex = 0   # number of captured groups opened so far
        self.group = {}       # group index (as str) / group name -> group index
        self.inGroup = 0      # current group nesting depth

    def dispose(self):
        """Drop all references held by the cursor."""
        self.re = None
        self.len = None
        self.pos = None
        self.index = None
        self.groupIndex = None
        self.group = None
        self.inGroup = None

    def __del__(self):
        self.dispose()


_NODE_TYPE_NAMES = {
    T_SEQUENCE: "Sequence",
    T_ALTERNATION: "Alternation",
    T_GROUP: "Group",
    T_CHARGROUP: "CharacterGroup",
    T_CHARS: "Characters",
    T_CHARRANGE: "CharacterRange",
    T_STRING: "String",
    T_QUANTIFIER: "Quantifier",
    T_UNICODECHAR: "UnicodeChar",
    T_HEXCHAR: "HexChar",
    T_SPECIAL: "Special",
    T_COMMENT: "Comment"
}


class Node():
    """A node of the regular expression syntax tree."""

    @staticmethod
    def toObjectStatic(v):
        """Recursively convert a Node (or list of Nodes) into plain dicts/lists."""
        if isinstance(v, Node):
            obj = {'type': v.typeName, 'value': Node.toObjectStatic(v.val)}
            if v.flags and len(v.flags):
                obj['flags'] = v.flags
            return obj
        elif is_array(v):
            return list(map(Node.toObjectStatic, v))
        return v

    def __init__(self, type, value, flags=None):
        self.type = type
        self.val = value
        self.flags = flags if flags else {}
        self.typeName = _NODE_TYPE_NAMES.get(type, "unspecified")

    def __del__(self):
        self.dispose()

    def dispose(self):
        """Drop all references held by the node."""
        self.val = None
        self.flags = None
        self.type = None
        self.typeName = None
        return self

    def toObject(self):
        """Return the subtree rooted at this node as plain dicts/lists."""
        return Node.toObjectStatic(self)


def match_chars(CHARS, s, pos=0, minlen=1, maxlen=100):
    """Count the characters of `s`, from `pos`, that belong to `CHARS`.

    At most `maxlen` characters are consumed. Returns the count, or False
    when fewer than `minlen` characters matched.
    """
    lp = pos
    l = 0
    sl = len(s)
    # `l < maxlen`: never consume more than maxlen characters
    while (lp < sl) and (l < maxlen) and (s[lp] in CHARS):
        lp += 1
        l += 1
    return l if l >= minlen else False


def match_char_range(RANGE, s, pos=0, minlen=1, maxlen=100):
    """Like `match_chars`, for characters whose code lies in `[RANGE[0], RANGE[1]]`."""
    lp = pos
    l = 0
    sl = len(s)
    while (lp < sl) and (l < maxlen) and (RANGE[0] <= ord(s[lp]) <= RANGE[1]):
        lp += 1
        l += 1
    return l if l >= minlen else False


def match_char_ranges(RANGES, s, pos=0, minlen=1, maxlen=100):
    """Like `match_char_range`, accepting a character that lies in any of `RANGES`."""
    lp = pos
    l = 0
    sl = len(s)
    found = True
    while (lp < sl) and (l < maxlen) and found:
        ch = ord(s[lp])
        found = False
        for RANGE in RANGES:
            if RANGE[0] <= ch <= RANGE[1]:
                lp += 1
                l += 1
                found = True
                break
    return l if l >= minlen else False


def punct():
    """Return a random punctuation character."""
    return _G.PUNCTS[rnd(0, len(_G.PUNCTS)-1)]


def space(positive=True):
    """Return a random space character, or a random non-space one when `positive` is False."""
    if positive is not False:
        return _G.SPACES[rnd(0, len(_G.SPACES)-1)]
    return [punct(), digit(), alpha()][rnd(0, 2)]


def digit(positive=True):
    """Return a random digit, or a random non-digit when `positive` is False."""
    if positive is not False:
        return _G.DIGITS[rnd(0, len(_G.DIGITS)-1)]
    return [punct(), space(), alpha()][rnd(0, 2)]


def alpha(positive=True):
    """Return a random letter or underscore, or a random other character when `positive` is False."""
    if positive is not False:
        return _G.ALPHAS[rnd(0, len(_G.ALPHAS)-1)]
    return [punct(), space(), digit()][rnd(0, 2)]


def word(positive=True):
    """Return a random word character, or a random non-word one when `positive` is False."""
    if positive is not False:
        s = _G.ALPHAS + _G.DIGITS
        return s[rnd(0, len(s)-1)]
    return [punct(), space()][rnd(0, 1)]


def any():
    """Return a random character out of `_G.ALL`."""
    return _G.ALL[rnd(0, len(_G.ALL)-1)]


def character(chars, positive=True):
    """Pick a random item of `chars`, or (when `positive` is False) of `_G.ALL` minus `chars`.

    Returns '' when there is nothing to choose from.
    """
    if positive is not False:
        choices = chars
    else:
        choices = [choice for choice in _G.ALL if choice not in chars]
    l = len(choices)
    return choices[rnd(0, l-1)] if l else ''


def random_upper_or_lower(c):
    """Return `str(c)` randomly lower-cased or upper-cased."""
    return str(c).lower() if rnd(0, 1) else str(c).upper()


def case_insensitive(chars, asArray=False):
    """Randomise the case of `chars`.

    With `asArray` each item (or each character of a string) gets its own
    random case and a list is returned; otherwise `chars` is cased as a whole.
    """
    if asArray:
        # list() splits a string into characters (str.split('') raises ValueError)
        if isinstance(chars, str):
            chars = list(chars)
        return list(map(random_upper_or_lower, chars))
    return random_upper_or_lower(chars)


def walk(ret, node, state):
    """Generic map/reduce traversal of the syntax tree.

    `state` provides the bit masks 'MAP', 'REDUCE' and 'IGNORE' plus the
    callables 'map' and 'reduce'. A mapped node is expanded by `map` into
    child items which are walked in turn, unless `map` stored a ready-made
    result in `state['ret']`; a reducible node, or any non-Node item, is
    folded into `ret` by `reduce`. Returns the accumulated `ret`.
    """
    if (node is None) or (not state):
        return ret

    ntype = node.type if isinstance(node, Node) else None

    if ntype is None:
        # custom item, let reduce handle it
        ret = state['reduce'](ret, node, state)

    elif state['IGNORE'] & ntype:
        pass

    elif state['MAP'] & ntype:
        r = state['map'](ret, node, state)
        if state.get('ret') is not None:
            # map computed the result for the whole subtree
            ret = state['reduce'](ret, node, state)
            state['ret'] = None
        elif r is not None:
            for ri in array(r):
                state['node'] = node  # parent of the item being walked
                ret = walk(ret, ri, state)
                if state.get('stop'):
                    state['stop'] = None
                    return ret

    elif state['REDUCE'] & ntype:
        ret = state['reduce'](ret, node, state)

    state['node'] = None
    return ret


def map_src(ret, node, state):
    """Map step of source synthesis: expand a node into source tokens and child nodes."""
    ntype = node.type
    if T_ALTERNATION == ntype:
        r = []
        last = len(node.val) - 1
        for i in range(last):
            r.append(node.val[i])
            r.append('|')
        r.append(node.val[last])
        return r

    elif T_CHARGROUP == ntype:
        return ['[' + ('^' if 'NegativeMatch' in node.flags else '')] + array(node.val) + [']']

    elif T_QUANTIFIER == ntype:
        lo = node.flags['min']
        hi = node.flags['max']
        if 'MatchZeroOrOne' in node.flags:
            q = '?'
        elif 'MatchZeroOrMore' in node.flags:
            q = '*'
        elif 'MatchOneOrMore' in node.flags:
            q = '+'
        elif lo == hi:
            q = '{' + str(lo) + '}'
        else:
            q = '{' + str(lo) + ',' + ('' if -1 == hi else str(hi)) + '}'
        if (lo != hi) and not node.flags['isGreedy']:
            q += '?'
        return array(node.val) + [q]

    elif T_GROUP == ntype:
        if 'NotCaptured' in node.flags:
            opener = '(?:'
        elif 'LookAhead' in node.flags:
            opener = '(?='
        elif 'NegativeLookAhead' in node.flags:
            opener = '(?!'
        elif 'LookBehind' in node.flags:
            opener = '(?<='
        elif 'NegativeLookBehind' in node.flags:
            opener = '(?<!'
        else:
            opener = '('
        if 'GroupIndex' in node.flags:
            ret['group'][str(node.flags['GroupIndex'])] = node.flags['GroupIndex']
            if 'GroupName' in node.flags:
                ret['group'][node.flags['GroupName']] = node.flags['GroupIndex']
        return [opener] + array(node.val) + [')']

    return node.val


def map_any(ret, node, state):
    """Map step of sampling: pick a random alternative / repeat count."""
    ntype = node.type
    if (T_ALTERNATION == ntype) or (T_CHARGROUP == ntype):
        return node.val[rnd(0, len(node.val)-1)] if len(node.val) else None

    elif T_QUANTIFIER == ntype:
        if len(ret) >= state['maxLength']:
            numrepeats = node.flags['min']
        else:
            mmin = node.flags['min']
            mmax = (mmin + 1 + 2*state['maxLength']) if -1 == node.flags['max'] else node.flags['max']
            numrepeats = rnd(mmin, mmax)
        return [node.val] * numrepeats if numrepeats else None

    elif (T_GROUP == ntype) and ('GroupIndex' in node.flags):
        # remember what the group produced, for later back-references
        sample = walk('', node.val, state)
        state['group'][str(node.flags['GroupIndex'])] = sample
        state['ret'] = sample
        return None

    return node.val


def map_min(ret, node, state):
    """Map step of the minimum-length computation."""
    ntype = node.type
    if T_ALTERNATION == ntype:
        l = len(node.val)
        shortest = walk(0, node.val[0], state) if l else 0
        for i in range(1, l):
            cur = walk(0, node.val[i], state)
            if cur < shortest:
                shortest = cur
        if l:
            state['ret'] = shortest
        return None

    elif T_CHARGROUP == ntype:
        return node.val[0] if len(node.val) else None

    elif T_QUANTIFIER == ntype:
        if 0 == node.flags['min']:
            return None
        return [node.val] * node.flags['min']

    elif (T_GROUP == ntype) and ('GroupIndex' in node.flags):
        shortest = walk(0, node.val, state)
        state['group'][str(node.flags['GroupIndex'])] = shortest
        state['ret'] = shortest
        return None

    return node.val


def map_max(ret, node, state):
    """Map step of the maximum-length computation (-1 stands for unbounded)."""
    ntype = node.type
    if T_ALTERNATION == ntype:
        l = len(node.val)
        longest = walk(0, node.val[0], state) if l else 0
        if -1 != longest:
            for i in range(1, l):
                cur = walk(0, node.val[i], state)
                if -1 == cur:
                    longest = -1
                    break
                elif cur > longest:
                    longest = cur
        if l:
            state['ret'] = longest
        return None

    elif T_CHARGROUP == ntype:
        return node.val[0] if len(node.val) else None

    elif T_QUANTIFIER == ntype:
        inner = walk(0, node.val, state)
        if -1 == inner:
            state['ret'] = -1
        elif 0 < inner:
            limit = node.flags['max']
            # an upper bound of 0 (e.g. `{0}`) contributes nothing
            state['ret'] = -1 if -1 == limit else limit * inner
        return None

    elif (T_GROUP == ntype) and ('GroupIndex' in node.flags):
        longest = walk(0, node.val, state)
        state['group'][str(node.flags['GroupIndex'])] = longest
        state['ret'] = longest
        return None

    return node.val


def map_1st(ret, node, state):
    """Map step of peeking: keep only the items that can start a match."""
    if T_SEQUENCE == node.type:
        seq = []
        for n in node.val:
            seq.append(n)
            if (T_QUANTIFIER == n.type) and (0 == n.flags['min']):
                # optional item, the following one can start the match too
                continue
            elif (T_SPECIAL == n.type) and (('MatchStart' in n.flags) or ('MatchEnd' in n.flags)):
                continue
            break
        return seq if len(seq) else None
    return node.val


def reduce_len(ret, node, state):
    """Reduce step of the length computations; -1 means unbounded and is sticky."""
    pending = state.get('ret')
    if pending is not None:
        # once unbounded, the total stays unbounded
        if (-1 == ret) or (-1 == pending):
            return -1
        return ret + pending

    if -1 == ret:
        return ret

    if isinstance(node, int):
        return ret + node

    if (T_SPECIAL == node.type) and ('MatchEnd' in node.flags):
        state['stop'] = 1
        return ret

    ntype = node.type
    if (T_CHARS == ntype) or (T_CHARRANGE == ntype) or (T_UNICODECHAR == ntype) or (T_HEXCHAR == ntype) or ((T_SPECIAL == ntype) and ('MatchStart' not in node.flags) and ('MatchEnd' not in node.flags)):
        if 'BackReference' in node.flags:
            ret += state['group'][node.val] if node.val in state['group'] else 0
        else:
            ret += 1
    elif T_STRING == ntype:
        ret += len(node.val)
    return ret


def _range_chars(node):
    """Return the `[first, last]` characters of a CharacterRange node."""
    bounds = [node.val[0], node.val[1]]
    for i in (0, 1):
        if isinstance(bounds[i], Node) and (T_UNICODECHAR == bounds[i].type or T_HEXCHAR == bounds[i].type):
            bounds[i] = bounds[i].flags['Char']
    return bounds


def reduce_str(ret, node, state):
    """Reduce step of sampling: append a random string matching `node` to `ret`."""
    if state.get('ret') is not None:
        return ret + str(state['ret'])

    if is_string(node):
        return ret + node

    if (T_SPECIAL == node.type) and ('MatchEnd' in node.flags):
        state['stop'] = 1
        return ret

    ntype = node.type
    sample = None

    if T_CHARS == ntype:
        sample = node.val

    elif T_CHARRANGE == ntype:
        sample = character_range(_range_chars(node))

    elif (T_UNICODECHAR == ntype) or (T_HEXCHAR == ntype):
        sample = [node.flags['Char']]

    elif (T_SPECIAL == ntype) and ('MatchStart' not in node.flags) and ('MatchEnd' not in node.flags):
        part = node.val
        if 'BackReference' in node.flags:
            return ret + (state['group'][part] if part in state['group'] else '')
        elif 'D' == part:
            sample = [digit(False)]
        elif 'W' == part:
            sample = [word(False)]
        elif 'S' == part:
            sample = [space(False)]
        elif 'd' == part:
            sample = [digit()]
        elif 'w' == part:
            sample = [word()]
        elif 's' == part:
            sample = [space()]
        elif ('.' == part) and ('MatchAnyChar' in node.flags):
            sample = [any()]
        else:
            sample = [ESC + part]

    elif T_STRING == ntype:
        sample = node.val

    if sample is not None:
        if T_STRING == ntype:
            ret += case_insensitive(sample) if state['isCaseInsensitive'] else sample
        else:
            parent = state.get('node')
            positive = (parent is None) or ('NegativeMatch' not in parent.flags)
            ret += character(case_insensitive(sample, True) if state['isCaseInsensitive'] else sample, positive)

    return ret


def _range_bound_src(bound):
    """Return the escaped source text of one bound of a character range."""
    if isinstance(bound, Node) and T_UNICODECHAR == bound.type:
        return ESC + 'u' + pad(bound.flags['Code'], 4)
    if isinstance(bound, Node) and T_HEXCHAR == bound.type:
        return ESC + 'x' + pad(bound.flags['Code'], 2)
    return esc_re(bound, ESC, 1)


def reduce_src(ret, node, state):
    """Reduce step of source synthesis: append the source text of `node` to `ret['src']`."""
    pending = state.get('ret')
    if pending is not None:
        if 'src' in pending:
            ret['src'] += pending['src']
        if 'group' in pending:
            ret['group'].update(pending['group'])
        return ret

    if is_string(node):
        ret['src'] += node
        return ret

    ntype = node.type
    escaped = state['escaped']
    if T_CHARS == ntype:
        ret['src'] += esc_re(''.join(node.val), ESC, 1) if escaped else ''.join(node.val)

    elif T_CHARRANGE == ntype:
        if escaped:
            bounds = [_range_bound_src(node.val[0]), _range_bound_src(node.val[1])]
        else:
            # each bound is resolved on its own (either one may be a Node)
            bounds = _range_chars(node)
        ret['src'] += bounds[0] + '-' + bounds[1]

    elif T_UNICODECHAR == ntype:
        ret['src'] += ESC + 'u' + pad(node.flags['Code'], 4) if escaped else node.flags['Char']

    elif T_HEXCHAR == ntype:
        ret['src'] += ESC + 'x' + pad(node.flags['Code'], 2) if escaped else node.flags['Char']

    elif T_SPECIAL == ntype:
        if 'BackReference' in node.flags:
            ret['src'] += ESC + node.val
        elif ('MatchAnyChar' in node.flags) or ('MatchStart' in node.flags) or ('MatchEnd' in node.flags):
            ret['src'] += '' + node.val
        else:
            ret['src'] += ESC + node.val

    elif T_STRING == ntype:
        ret['src'] += esc_re(node.val, ESC) if escaped else node.val

    return ret


def reduce_peek(ret, node, state):
    """Reduce step of peeking: record the characters `node` can (not) start with."""
    pending = state.get('ret')
    if pending is not None:
        ret['positive'] = concat(ret['positive'], pending['positive'])
        ret['negative'] = concat(ret['negative'], pending['negative'])
        return ret

    # an end-of-input anchor stops the walk, as in reduce_len / reduce_str
    if (T_SPECIAL == node.type) and ('MatchEnd' in node.flags):
        state['stop'] = 1
        return ret

    ntype = node.type
    parent = state.get('node')
    inCharGroup = (parent is not None) and (T_CHARGROUP == parent.type)
    # 'NegativeMatch' is the flag chargroup() sets for `[^...]`
    inNegativeCharGroup = inCharGroup and ('NegativeMatch' in parent.flags)
    peek = "negative" if inNegativeCharGroup else "positive"
    opposite = "positive" if inNegativeCharGroup else "negative"

    if T_CHARS == ntype:
        ret[peek] = concat(ret[peek], node.val)

    elif T_CHARRANGE == ntype:
        ret[peek] = concat(ret[peek], character_range(_range_chars(node)))

    elif (T_UNICODECHAR == ntype) or (T_HEXCHAR == ntype):
        ret[peek][node.flags['Char']] = 1

    elif (T_SPECIAL == ntype) and ('BackReference' not in node.flags) and ('MatchStart' not in node.flags) and ('MatchEnd' not in node.flags):
        part = node.val
        if 'D' == part:
            ret[opposite]['\\d'] = 1
        elif 'W' == part:
            ret[opposite]['\\w'] = 1
        elif 'S' == part:
            ret[opposite]['\\s'] = 1
        elif 'B' == part:
            ret[opposite]['\\b'] = 1
        else:
            ret[peek][ESC + part] = 1

    elif T_STRING == ntype:
        ret["positive"][node.val[0]] = 1

    return ret


def match_hex(s):
    """Match `xHH` at the start of `s`; return `['xHH', 'HH']` or False."""
    if (len(s) > 2) and ('x' == s[0]):
        if match_char_ranges(_G.HEXDIGITS_RANGES, s, 1, 2, 2):
            m = s[0:3]
            return [m, m[1:]]
    return False


def match_unicode(s):
    """Match `uHHHH` at the start of `s`; return `['uHHHH', 'HHHH']` or False."""
    if (len(s) > 4) and ('u' == s[0]):
        if match_char_ranges(_G.HEXDIGITS_RANGES, s, 1, 4, 4):
            m = s[0:5]
            return [m, m[1:]]
    return False


def match_repeats(s):
    """Match a `{min}`, `{min,}` or `{min,max}` quantifier at the start of `s`.

    Spaces are allowed around the numbers. Returns `[matched text, min, max]`
    (numbers as strings, max is None when open-ended) or False.
    """
    pos = 0
    hasComma = False
    sl = len(s)
    if (sl > 2) and ('{' == s[pos]):
        m = ['', '', None]
        pos += 1
        pos += match_chars(_G.SPACES, s, pos) or 0
        l = match_char_range(_G.DIGITS_RANGE, s, pos)
        if not l:
            return False
        m[1] = s[pos:pos+l]
        pos += l
        pos += match_chars(_G.SPACES, s, pos) or 0
        if (pos < sl) and (',' == s[pos]):
            pos += 1
            hasComma = True
        pos += match_chars(_G.SPACES, s, pos) or 0
        l = match_char_range(_G.DIGITS_RANGE, s, pos)
        if l:
            m[2] = s[pos:pos+l]
            pos += l
        pos += match_chars(_G.SPACES, s, pos) or 0
        if (pos < sl) and ('}' == s[pos]):
            pos += 1
            m[0] = s[0:pos]
            if not hasComma:
                m[2] = m[1]
            return m
    return False


def _code_char_node(re_obj, ch):
    """Parse the `\\uHHHH` / `\\xHH` escape whose letter `ch` was just consumed."""
    if 'u' == ch:
        m = match_unicode(re_obj.re[re_obj.pos-1:])
        ntype = T_UNICODECHAR
    else:
        m = match_hex(re_obj.re[re_obj.pos-1:])
        ntype = T_HEXCHAR
    re_obj.pos += len(m[0]) - 1
    return Node(ntype, m[0], {"Char": chr(int(m[1], 16)), "Code": m[1]})


def chargroup(re_obj):
    """Parse a character group; `re_obj.pos` is just after the opening '['.

    Returns a CharacterGroup node whose value lists the ranges, code
    characters and specials in order, followed by one Characters node
    collecting all the plain characters.
    """
    sequence = []
    allchars = []
    chars = []
    flags = {}
    isRange = False
    bounds = None
    ch = ''
    lre = re_obj.len

    if (re_obj.pos < lre) and ('^' == re_obj.re[re_obj.pos]):
        flags["NegativeMatch"] = 1
        re_obj.pos += 1

    while re_obj.pos < lre:
        isCodeChar = False
        prevch = ch
        ch = re_obj.re[re_obj.pos]
        re_obj.pos += 1

        escaped = (ESC == ch)
        if escaped:
            ch = re_obj.re[re_obj.pos]
            re_obj.pos += 1
            if ('u' == ch) or ('x' == ch):
                ch = _code_char_node(re_obj, ch)
                isCodeChar = True

        if isRange:
            # `ch` closes the range opened by the previous '-'
            allchars = allchars + chars
            chars = []
            bounds[1] = ch
            isRange = False
            sequence.append(Node(T_CHARRANGE, bounds))

        elif escaped:
            if isCodeChar:
                allchars = allchars + chars
                chars = []
                sequence.append(ch)
            elif (ch in _G.specialCharsEscaped) and ('/' != ch):
                allchars = allchars + chars
                chars = []
                sequence.append(Node(T_SPECIAL, ch, {_G.specialCharsEscaped[ch]: 1}))
            else:
                chars.append(ch)

        elif ']' == ch:
            # end of char group
            break

        elif '-' == ch:
            # a '-' with nothing before it, or right before the closing ']',
            # is a literal dash and does not open a range
            hasStart = isinstance(prevch, Node) or bool(chars)
            hasEnd = (re_obj.pos < lre) and (']' != re_obj.re[re_obj.pos])
            if hasStart and hasEnd:
                bounds = [prevch, '']
                if isinstance(prevch, Node):
                    sequence.pop()
                else:
                    chars.pop()
                isRange = True
            else:
                chars.append(ch)

        else:
            chars.append(ch)

    allchars = allchars + chars
    # map all chars into one node
    if len(allchars):
        sequence.append(Node(T_CHARS, allchars))
    return Node(T_CHARGROUP, sequence, flags)


def _read_until(re_obj, closer):
    """Consume characters up to and including `closer`; return the text before it."""
    text = ''
    while re_obj.pos < re_obj.len:
        ch = re_obj.re[re_obj.pos]
        re_obj.pos += 1
        if closer == ch:
            break
        text += ch
    return text


def _flush_word(sequence, text):
    """Append pending literal `text` (if any) as a String node; return the emptied buffer."""
    if text:
        sequence.append(Node(T_STRING, text))
    return ''


def _apply_quantifier(re_obj, sequence, flag):
    """Read an optional lazy marker '?', then wrap the previous node in a Quantifier node."""
    if (re_obj.pos < re_obj.len) and ('?' == re_obj.re[re_obj.pos]):
        flag["isGreedy"] = 0
        re_obj.pos += 1
    else:
        flag["isGreedy"] = 1
    prev = sequence.pop()
    if T_STRING == prev.type and len(prev.val) > 1:
        # only the last character of a literal run is quantified
        sequence.append(Node(T_STRING, prev.val[0:-1]))
        prev.val = prev.val[-1]
    sequence.append(Node(T_QUANTIFIER, prev, flag))


def _read_back_reference(re_obj, first_digit):
    """Read the remaining digits of a `\\N` back-reference; return its Special node."""
    digits = first_digit
    while re_obj.pos < re_obj.len:
        ch = re_obj.re[re_obj.pos]
        if ('0' <= ch) and ('9' >= ch):
            digits += ch
            re_obj.pos += 1
        else:
            break
    return Node(T_SPECIAL, digits, {'BackReference': 1, 'GroupIndex': int(digits, 10)})


def _parse_group_prefix(re_obj, flags):
    """Parse the `?...` prefix of a group that was just opened.

    Fills `flags` and registers a captured group in `re_obj.group`.
    Returns a finished node for `(?P=name)` back-references and `(?#...)`
    comments (which have no body to parse), otherwise None.
    """
    pre = re_obj.re[re_obj.pos:re_obj.pos+2]
    pre3 = re_obj.re[re_obj.pos:re_obj.pos+3]
    captured = 1

    if "?P=" == pre3:
        flags["BackReference"] = 1
        re_obj.pos += 3
        flags["GroupName"] = _read_until(re_obj, ")")
        flags["GroupIndex"] = re_obj.group.get(flags["GroupName"])
        return Node(T_SPECIAL, str(flags["GroupIndex"]), flags)

    elif "?#" == pre:
        re_obj.pos += 2
        return Node(T_COMMENT, _read_until(re_obj, ")"))

    elif "?:" == pre:
        flags["NotCaptured"] = 1
        re_obj.pos += 2
        captured = 0

    elif "?=" == pre:
        flags["LookAhead"] = 1
        re_obj.pos += 2
        captured = 0

    elif "?!" == pre:
        flags["NegativeLookAhead"] = 1
        re_obj.pos += 2
        captured = 0

    elif "?<=" == pre3:
        flags["LookBehind"] = 1
        re_obj.pos += 3
        captured = 0

    elif "?<!" == pre3:
        flags["NegativeLookBehind"] = 1
        re_obj.pos += 3
        captured = 0

    elif ("?<" == pre) or ("?P<" == pre3):
        flags["NamedGroup"] = 1
        re_obj.pos += 2 if "?<" == pre else 3
        flags["GroupName"] = _read_until(re_obj, ">")

    re_obj.index += 1
    if captured:
        re_obj.groupIndex += 1
        flags["GroupIndex"] = re_obj.groupIndex
        re_obj.group[str(flags["GroupIndex"])] = flags["GroupIndex"]
        if "GroupName" in flags:
            re_obj.group[flags["GroupName"]] = flags["GroupIndex"]
    return None


def analyze_re(re_obj):
    """Parse the regular expression held by `re_obj` into a syntax tree.

    Called recursively for each group (with `re_obj.inGroup` > 0), in which
    case parsing stops at the matching ')' and a Group node is returned.
    At top level a Sequence node, or an Alternation node of Sequences, is
    returned.
    """
    text = ''  # pending run of literal characters
    alternation = []
    sequence = []
    flags = {}

    if re_obj.inGroup > 0:
        finished = _parse_group_prefix(re_obj, flags)
        if finished is not None:
            return finished

    lre = re_obj.len
    while re_obj.pos < lre:
        ch = re_obj.re[re_obj.pos]
        re_obj.pos += 1

        if ESC == ch:
            ch = re_obj.re[re_obj.pos]
            re_obj.pos += 1

            if ('u' == ch) or ('x' == ch):
                # unicode / hex character
                text = _flush_word(sequence, text)
                sequence.append(_code_char_node(re_obj, ch))

            elif (ch in _G.specialCharsEscaped) and ('/' != ch):
                text = _flush_word(sequence, text)
                sequence.append(Node(T_SPECIAL, ch, {_G.specialCharsEscaped[ch]: 1}))

            elif ('1' <= ch) and ('9' >= ch):
                text = _flush_word(sequence, text)
                sequence.append(_read_back_reference(re_obj, ch))

            else:
                text += ch

        # group end
        elif (re_obj.inGroup > 0) and (')' == ch):
            text = _flush_word(sequence, text)
            if len(alternation):
                alternation.append(Node(T_SEQUENCE, sequence))
                return Node(T_GROUP, Node(T_ALTERNATION, alternation, {_G.specialChars['|']: 1}), flags)
            return Node(T_GROUP, Node(T_SEQUENCE, sequence), flags)

        # parse alternation
        elif '|' == ch:
            text = _flush_word(sequence, text)
            alternation.append(Node(T_SEQUENCE, sequence))
            sequence = []

        # parse character group
        elif '[' == ch:
            text = _flush_word(sequence, text)
            sequence.append(chargroup(re_obj))

        # parse sub-group
        elif '(' == ch:
            text = _flush_word(sequence, text)
            re_obj.inGroup += 1
            sequence.append(analyze_re(re_obj))
            re_obj.inGroup -= 1

        # parse num repeats
        elif '{' == ch:
            text = _flush_word(sequence, text)
            m = match_repeats(re_obj.re[re_obj.pos-1:])
            re_obj.pos += len(m[0]) - 1
            flag = {
                'val': m[0],
                "MatchMinimum": m[1],
                "MatchMaximum": m[2] if m[2] else "unlimited",
                'min': int(m[1], 10),
                'max': int(m[2], 10) if m[2] else -1
            }
            flag[_G.specialChars[ch]] = 1
            _apply_quantifier(re_obj, sequence, flag)

        # quantifiers
        elif ('*' == ch) or ('+' == ch) or ('?' == ch):
            text = _flush_word(sequence, text)
            flag = {_G.specialChars[ch]: 1}
            flag['min'] = 1 if '+' == ch else 0
            flag['max'] = 1 if '?' == ch else -1
            _apply_quantifier(re_obj, sequence, flag)

        # special characters like ^, $, ., etc..
        elif ch in _G.specialChars:
            text = _flush_word(sequence, text)
            sequence.append(Node(T_SPECIAL, ch, {_G.specialChars[ch]: 1}))

        else:
            # plain literal; an unescaped digit is a literal, only `\N`
            # is a back-reference
            text += ch

    text = _flush_word(sequence, text)

    if len(alternation):
        alternation.append(Node(T_SEQUENCE, sequence))
        return Node(T_ALTERNATION, alternation, {_G.specialChars['|']: 1})
    return Node(T_SEQUENCE, sequence)


# node type masks shared by all the tree walks
_MAP_TYPES = T_SEQUENCE | T_ALTERNATION | T_GROUP | T_CHARGROUP | T_QUANTIFIER
_REDUCE_TYPES = T_UNICODECHAR | T_HEXCHAR | T_SPECIAL | T_CHARS | T_CHARRANGE | T_STRING


def _walk_state(map_fn, reduce_fn, **extra):
    """Build a fresh `state` dict for `walk` with the given map/reduce callables."""
    state = {
        'MAP': _MAP_TYPES,
        'REDUCE': _REDUCE_TYPES,
        'IGNORE': T_COMMENT,
        'map': map_fn,
        'reduce': reduce_fn,
        'group': {}
    }
    state.update(extra)
    return state


# https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_Expressions
# https://docs.python.org/3/library/re.html
# http://php.net/manual/en/reference.pcre.pattern.syntax.php
# A simple regular expression analyzer
class Analyzer:
    """Parse a regular expression and answer questions about it."""

    VERSION = "1.1.0"

    def __init__(self, re=None, delim='/'):
        self.ast = None   # syntax tree, built on demand by analyze()
        self.re = None    # regex source without delimiters and flags
        self.fl = None    # dict of flag characters found after the closing delimiter
        self.src = None   # synthesized source
        self.grp = None   # group name / index (as str) -> group index
        self.min = None
        self.max = None
        self.ch = None
        if re is not None:
            self.input(re, delim)

    def __del__(self):
        self.dispose()

    def dispose(self):
        """Drop the input and every computed result."""
        self.ast = None
        self.re = None
        self.fl = None
        self.src = None
        self.grp = None
        self.min = None
        self.max = None
        self.ch = None
        return self

    def reset(self):
        """Drop every computed result, keeping the input."""
        self.ast = None
        self.src = None
        self.grp = None
        self.min = None
        self.max = None
        self.ch = None
        return self

    # alias
    def set(self, re, delim='/'):
        """Alias of `input`."""
        return self.input(re, delim)

    def input(self, *args):
        """Set the regular expression to analyze: `input(re, delim='/')`.

        `re` is a string or a compiled pattern. Unless `delim` is False the
        string is expected to be wrapped in `delim` characters, optionally
        followed by flag characters (e.g. `/abc/i`). Without arguments the
        current regex source is returned; otherwise returns self.
        """
        lenargs = len(args)
        if not lenargs:
            return self.re
        pattern = args[0]
        delim = args[1] if lenargs > 1 else None
        if pattern:
            if (delim is not False) and (not delim):
                delim = '/'
            pattern = str(pattern.pattern if is_regexp(pattern) else pattern)
            fl = {}
            l = len(pattern)

            if delim:
                # parse re flags, if any: everything after the last delimiter
                while 0 < l:
                    ch = pattern[l-1]
                    if delim == ch:
                        break
                    fl[ch] = 1
                    l -= 1

                if 0 < l:
                    # remove re delimiters
                    if delim == pattern[0] and delim == pattern[l-1]:
                        pattern = pattern[1:l-1]
                    else:
                        pattern = pattern[0:l]
                else:
                    pattern = ''

            # re is different, reset the ast, etc
            if self.re != pattern:
                self.reset()
            self.re = pattern
            self.fl = fl
        return self

    def analyze(self):
        """Build the syntax tree, unless it is already available."""
        if self.re and (self.ast is None):
            cursor = RE_OBJ(self.re)
            self.ast = analyze_re(cursor)
            cursor.dispose()
        return self

    def synthesize(self, escaped=True):
        """(Re)build the regex source and the groups map from the syntax tree."""
        if self.re is None:
            return self
        if self.ast is None:
            self.analyze()
        state = _walk_state(map_src, reduce_src, escaped=escaped is not False)
        result = walk({'src': '', 'group': {}}, self.ast, state)
        self.src = result['src']
        self.grp = result['group']
        return self

    def source(self):
        """Return the synthesized regex source, or None without input."""
        if not self.re:
            return None
        if self.src is None:
            self.synthesize()
        return self.src

    def groups(self, raw=False):
        """Return the groups map (a copy, unless `raw` is True), or None without input."""
        if not self.re:
            return None
        if self.grp is None:
            self.synthesize()
        return self.grp if raw is True else self.grp.copy()

    def compile(self, flags=None):
        """Compile the synthesized source with `re`, honouring the i, m and s flags.

        `flags` is a container of flag characters and defaults to the flags
        parsed from the input.
        """
        if not self.re:
            return None
        flags = (self.fl if self.fl else {}) if not flags else flags
        return re.compile(self.source(), (re.I if ('i' in flags) or ('I' in flags) else 0) | (re.M if ('m' in flags) or ('M' in flags) else 0) | (re.S if ('s' in flags) or ('S' in flags) else 0))

    def tree(self, flat=False):
        """Return the syntax tree, as Nodes or (with `flat` True) as plain dicts/lists."""
        if not self.re:
            return None
        if self.ast is None:
            self.analyze()
        return self.ast.toObject() if flat is True else self.ast

    # experimental feature
    def sample(self, maxlen=1, numsamples=1):
        """Return a random string built from the regex (a list when `numsamples` > 1)."""
        if not self.re:
            return None
        if self.ast is None:
            self.analyze()
        state = _walk_state(map_any, reduce_str, maxLength=maxlen if maxlen else 1, isCaseInsensitive=('i' in self.fl))
        if 1 < numsamples:
            return [walk('', self.ast, state) for i in range(numsamples)]
        return walk('', self.ast, state)

    # experimental feature
    def minimum(self):
        """Return the minimum length computed for the regex."""
        if not self.re:
            return 0
        if self.ast is None:
            self.analyze()
        self.min = walk(0, self.ast, _walk_state(map_min, reduce_len))
        return self.min

    # experimental feature
    def maximum(self):
        """Return the maximum length computed for the regex (-1 when unbounded)."""
        if not self.re:
            return 0
        if self.ast is None:
            self.analyze()
        self.max = walk(0, self.ast, _walk_state(map_max, reduce_len))
        return self.max

    # experimental feature
    def peek(self):
        """Return `{'positive': {...}, 'negative': {...}}` describing first characters.

        The keys of 'positive' are characters collected from the items that
        can start a match, the keys of 'negative' are characters collected
        from negated items. Character classes are expanded to their members.
        """
        if not self.re:
            return None
        if self.ast is None:
            self.analyze()
        self.ch = walk({'positive': {}, 'negative': {}}, self.ast, _walk_state(map_1st, reduce_peek))

        peek = {'positive': copy.copy(self.ch['positive']), 'negative': copy.copy(self.ch['negative'])}
        isCaseInsensitive = 'i' in self.fl
        for n in ('positive', 'negative'):
            p = peek[n]
            cases = {}
            # iterate over a snapshot of the keys, entries are deleted on the way
            for c in list(p.keys()):
                if '\\d' == c:
                    del p[c]
                    cases = concat(cases, character_range('0', '9'))
                elif '\\s' == c:
                    del p[c]
                    cases = concat(cases, ['\f', '\n', '\r', '\t', '\v', '\u00A0', '\u2028', '\u2029'])
                elif '\\w' == c:
                    del p[c]
                    cases = concat(cases, ['_'] + character_range('0', '9') + character_range('a', 'z') + character_range('A', 'Z'))
                elif '\\b' == c:
                    del p[c]
                    # 'b' is only special when escaped
                    cases[_G.specialCharsEscaped['b']] = 1
                elif '\\.' == c:
                    del p[c]
                    cases[_G.specialChars['.']] = 1
                elif (ESC != c[0]) and isCaseInsensitive:
                    cases[c.lower()] = 1
                    cases[c.upper()] = 1
                elif ESC == c[0]:
                    del p[c]
            peek[n] = concat(p, cases)
        return peek


def flatten(a):
    """Return a flat list of the items of the arbitrarily nested list/tuple `a`."""
    # work on a list so the slices below concatenate for tuples (e.g. *args) too
    a = list(a)
    r = []
    while len(a):
        if isinstance(a[0], (list, tuple)):
            a = list(a[0]) + a[1:]
        else:
            r.append(a[0])
            a = a[1:]
    return r


def getArgs(args):
    """Return the call arguments `args` as one flat list."""
    return flatten(args)


# A simple regular expression composer
class Composer:
    """Build a regular expression step by step through chained method calls."""

    VERSION = "1.1.0"

    def __init__(self):
        self.re = None
        self.g = 0
        self.grp = None
        self.level = 0
        self.ast = None
        self.reset()

    def __del__(self):
        self.dispose()

    def dispose(self):
        """Drop all the state held by the composer."""
        self.re = None
        self.g = None
        self.grp = None
        self.level = None
        self.ast = None
        return self

    def reset(self):
        """Start a new, empty regular expression."""
        self.g = 0
        self.grp = {}
        self.level = 0
        self.ast = [{'node': [], 'type': T_SEQUENCE, 'flag': ''}]
        return self

    def compose(self, flags=0):
        """Finish the regex; return a dict with its source, flags, groups and compiled pattern."""
        src = ''.join(self.ast[0]['node'])
        self.re = {
            'source': src,
            'flags': flags,
            'groups': self.grp,
            'pattern': re.compile(src, flags)
        }
        self.reset()
        return self.re

    def partial(self, reset=True):
        """Return the source composed so far, resetting the composer unless `reset` is False."""
        src = ''.join(self.ast[0]['node'])
        if reset is not False:
            self.reset()
        return src

    def token(self, token, escaped=False):
        """Append `token` (regex-escaped when `escaped`); a falsy token is ignored."""
        if token:
            self.ast[self.level]['node'].append(esc_re(str(token), ESC) if escaped else str(token))
        return self

    def match(self, token, escaped=False):
        """Alias of `token`."""
        return self.token(token, escaped)

    def literal(self, literal):
        """Append `literal` as escaped text."""
        return self.token(str(literal), True)

    def regexp(self, re):
        """Append `re` as raw regex source."""
        return self.token(str(re), False)

    def sub(self, re):
        """Alias of `regexp`."""
        return self.regexp(re)

    def SOL(self):
        """Append '^'."""
        self.ast[self.level]['node'].append('^')
        return self

    def SOF(self):
        """Alias of `SOL`."""
        return self.SOL()

    def EOL(self):
        """Append '$'."""
        self.ast[self.level]['node'].append('$')
        return self

    def EOF(self):
        """Alias of `EOL`."""
        return self.EOL()

    def LF(self):
        """Append an escaped 'n'."""
        self.ast[self.level]['node'].append(ESC+'n')
        return self

    def CR(self):
        """Append an escaped 'r'."""
        self.ast[self.level]['node'].append(ESC+'r')
        return self

    def TAB(self):
        """Append an escaped 't'."""
        self.ast[self.level]['node'].append(ESC+'t')
        return self

    def CTRL(self, code='0'):
        """Append an escaped 'c' followed by `code`."""
        self.ast[self.level]['node'].append(ESC+'c'+str(code))
        return self

    def HEX(self, code='0'):
        """Append an escaped 'x' followed by `code` zero-padded to 2 characters."""
        self.ast[self.level]['node'].append(ESC+'x'+pad(code, 2))
        return self

    def UNICODE(self, code='0'):
        """Append an escaped 'u' followed by `code` zero-padded to 4 characters."""
        self.ast[self.level]['node'].append(ESC+'u'+pad(code, 4))
        return self

    def backSpace(self):
        """Append a character group holding an escaped 'b'."""
        self.ast[self.level]['node'].append('['+ESC+'b]')
        return self

    def any(self, multiline=False):
        """Append '.', or a space-or-non-space character group when `multiline`."""
        self.ast[self.level]['node'].append('['+ESC+'s'+ESC+'S]' if multiline else '.')
        return self

    def space(self, positive=True):
        """Append an escaped 's' (or 'S' when not `positive`)."""
        self.ast[self.level]['node'].append(ESC+'S' if not positive else ESC+'s')
        return self

    def digit(self, positive=True):
        """Append an escaped 'd' (or 'D' when not `positive`)."""
        self.ast[self.level]['node'].append(ESC+'D' if not positive else ESC+'d')
        return self

    def word(self, positive=True):
        """Append an escaped 'w' (or 'W' when not `positive`)."""
        self.ast[self.level]['node'].append(ESC+'W' if not positive else ESC+'w')
        return self

    def boundary(self, positive=True):
        """Append an escaped 'b' (or 'B' when not `positive`)."""
        self.ast[self.level]['node'].append(ESC+'B' if not positive else ESC+'b')
        return self

    def characters(self, *args):
        """Append the given characters, escaped; only acts inside a character group."""
        if T_CHARGROUP == self.ast[self.level]['type']:
            self.ast[self.level]['node'].append(''.join(list(map(lambda s: esc_re(str(s), ESC, 1), getArgs(args)))))
        return self

    def chars(self, *args):
        """Same as `characters`."""
        if T_CHARGROUP == self.ast[self.level]['type']:
            self.ast[self.level]['node'].append(''.join(list(map(lambda s: esc_re(str(s), ESC, 1), getArgs(args)))))
        return self

    def range(self, start, end):
        """Append the range `start-end`, escaped; only acts inside a character group."""
        if T_CHARGROUP == self.ast[self.level]['type']:
            self.ast[self.level]['node'].append(esc_re(str(start), ESC, 1)+'-'+esc_re(str(end), ESC, 1))
        return self

    def backReference(self, n):
        """Append a back-reference to group `n` (an index, or a name registered in `self.grp`)."""
        n = str(n)
        self.ast[self.level]['node'].append(ESC+str(self.grp[n] if n in self.grp else n))
        return self

    def repeat(self, min, max=None, greedy=True):
        """Quantify the last token with `{min}` or `{min,max}` (lazy when not `greedy`)."""
        repeat = ('{'+str(min)+'}' if max is None or max == min else '{'+str(min)+','+str(max)+'}') + ('?' if not greedy else '')
        self.ast[self.level]['node'][len(self.ast[self.level]['node'])-1] += repeat
        return self

    def zeroOrOne(self, greedy=True):
        """Quantify the last token with '?' ('??' when not `greedy`)."""
        self.ast[self.level]['node'][len(self.ast[self.level]['node'])-1] += ('??' if not greedy else '?')
        return self

    def zeroOrMore(self, greedy=True):
        """Quantify the last token with '*' ('*?' when not `greedy`)."""
        self.ast[self.level]['node'][len(self.ast[self.level]['node'])-1] += ('*?' if not greedy else '*')
        return self

    def oneOrMore(self, greedy=True):
        """Quantify the last token with '+' ('+?' when not `greedy`)."""
        self.ast[self.level]['node'][len(self.ast[self.level]['node'])-1] += ('+?' if not greedy else '+')
        return self

    def alternate(self):
        """Open a new alternation level."""
        self.level += 1
        self.ast.append({'node': [], 'type': T_ALTERNATION, 'flag': '', 'sequences': []})
        return self

    def either(self):
        """Alias of `alternate`."""
        return self.alternate()

    def or_(self):
        """Close the current alternative of an alternation and start the next one."""
        ast = self.ast[self.level]
        if (T_ALTERNATION == ast['type']) and len(ast['node']):
            ast['sequences'].append(''.join(ast['node']))
            ast['node'] = []
        return self

    # NOTE(review): the remainder of this method lies beyond the reviewed
    # excerpt; the visible statements are reproduced unchanged.
    def group( self, opts=dict(), v=None ):
        type = T_GROUP
        fl = ''
        if is_string(opts):
            fl = opts
            opts = {}
            opts[fl] = v
            fl = ''
        if ('name' in opts) and len(str(opts['name'])):
            self.g += 1
            self.grp[str(self.g)] = self.g
            self.grp[str(opts['name'])] = self.g
        elif ('lookahead' in opts) and ((opts['lookahead'] is True) or (opts['lookahead'] is False)):
            fl = '?!' if opts['lookahead'] is False else '?='
        elif ('lookbehind' in opts) and ((opts['lookbehind'] is True) or (opts['lookbehind'] is False)):
            fl = '?<!'
if opts['lookbehind'] is False else '?<=' elif ('nocapture' in opts) and (opts['nocapture'] is True): fl = '?:'; elif ('characters' in opts) and ((opts['characters'] is True) or (opts['characters'] is False)): type = T_CHARGROUP fl = '^' if opts['characters'] is False else '' else: self.g += 1 self.grp[str(self.g)] = self.g self.level += 1 self.ast.append({'node': [], 'type': type, 'flag': fl}) return self def subGroup( self, opts=dict() ): return self.group( opts ) def characterGroup( self, positive=True ): return self.group({'characters':positive is not False}) def charGroup( self, positive=True ): return self.group({'characters':positive is not False}) def namedGroup( self, name ): return self.group({'name':name}) def nonCaptureGroup( self ): return self.group({'nocapture':True}) def lookAheadGroup( self, positive=True ): return self.group({'lookahead':positive is not False}) def lookBehindGroup( self, positive=True ): return self.group({'lookbehind':positive is not False}) def end( self, n=1 ): # support ending multiple blocks at once if not isinstance(n, int): n = int(n, 10) if 0 >= n: n = 1 while n : n -= 1 prev = self.ast.pop(-1) if len(self.ast) else None type = prev['type'] if prev else 0 flag = prev['flag'] if prev else '' part = prev['node'] if prev else [] if T_ALTERNATION == type: sequences = prev['sequences'] if prev else [] part = sequences if not len(part) else (sequences + [''.join(part)]) if 0 < self.level: self.level -= 1 if T_ALTERNATION == type: self.ast[self.level]['node'].append('|'.join(part)) elif T_GROUP == type: self.ast[self.level]['node'].append('('+flag+''.join(part)+')') elif T_CHARGROUP == type: self.ast[self.level]['node'].append('['+flag+''.join(part)+']') else: self.ast[self.level]['node'].append(''.join(part)) return self def startOfLine( self ): return self.SOL( ) def startOfInput( self ): return self.SOF( ) def endOfLine( self ): return self.EOL( ) def endOfInput( self ): return self.EOF( ) class Regex: """ Regular Expressipn 
Analyzer and Composer for Python https://github.com/foo123/Analyzer """ VERSION = "1.1.0" Node = Node Analyzer = Analyzer Composer = Composer # if used with 'import *' __all__ = ['Regex']