"""
Helpers to build cross-flavor regular expressions.
"""
import re
class RegexBuilder:
    """
    Base class for helpers that build regular expressions for one regex
    flavor (backend).

    Do not instantiate this class directly; use :func:`RegexBuilder.coerce`
    to obtain the builder for a backend. The methods here read two instance
    attributes that a concrete subclass is expected to set up:

        * ``constructs`` - maps a construct name (e.g. ``'group'``) to a
          ``str.format`` template containing a ``{pat}`` field (and, for
          named constructs, a ``{name}`` field).

        * ``special`` - maps a pattern key (e.g. ``'word'``) to the pattern
          text for that key.

    Notes:
        The way to have multiple negative look aheads/behinds is to chain
        them together [SO12689046]_.

    References:
        .. [SO12689046] https://stackoverflow.com/questions/12689046/multiple-negative-lookbehind-assertions-in-python-regex

    Example:
        b = RegexBuilder.coerce('python')
        import re
        pat = re.compile('[A-Z-]+')
    """

    # Patterns shared by every backend. Each ``key`` must be unique because
    # subclasses build their ``special`` lookup table keyed on it.
    common_patterns = [
        {'key': 'word', 'pattern': r'\w', 'docs': r'An alphanumeric word, i.e. [a-zA-Z0-9_] (also matches unicode characters in Python)'},
        {'key': 'non-word', 'pattern': r'\W', 'docs': r'Anything not a word'},
        {'key': 'space', 'pattern': r'\s', 'docs': r'Any space character including: " " "\t", "\n", "\r"'},
        {'key': 'non-space', 'pattern': r'\S', 'docs': r'Any non-space character'},
        {'key': 'digit', 'pattern': r'\d', 'docs': r'any number 0-9'},
        {'key': 'non-digit', 'pattern': r'\D', 'docs': r'any non-digit'},
        {'key': 'zero_or_more', 'pattern': r'*', 'docs': r'zero or more of the pattern to the left', 'alias': ['kleene_star']},
    ]

    def __init__(self):
        raise Exception('Use ``RegexBuilder.coerce(backend=...)`` instead')

    def _resolve_lookaround_mode(self, positive, mode, funcname):
        """
        Reconcile the deprecated ``positive`` flag with the ``mode`` string.

        Args:
            positive (bool | None): deprecated flag; when not None it takes
                precedence over ``mode`` and a deprecation is scheduled.
            mode (str): either ``'positive'`` or ``'negative'``.
            funcname (str): name of the public method, used in the
                deprecation message.

        Returns:
            str: ``'positive'`` or ``'negative'``

        Raises:
            KeyError: if the resulting mode is not a known mode.
        """
        if positive is not None:
            # Only warn when the caller explicitly used the old argument.
            import ubelt as ub
            ub.schedule_deprecation(
                'xdev', 'positive', f'arg to {funcname}',
                migration='use mode=positive or mode=negative instead',
                deprecate='now')
            mode = 'positive' if positive else 'negative'
        if mode not in {'positive', 'negative'}:
            raise KeyError(mode)
        return mode

    def lookahead(self, pat, positive=None, mode='positive'):
        """
        A lookahead pattern that can be positive or negative.

        Args:
            pat (str): the pattern to look ahead for.
            positive (bool | None): deprecated, use ``mode`` instead. If
                given, it overrides ``mode``.
            mode (str): ``'positive'`` or ``'negative'``.

        Returns:
            str: the lookahead expression for this backend.

        Raises:
            KeyError: if ``mode`` is not ``'positive'`` or ``'negative'``.
        """
        mode = self._resolve_lookaround_mode(positive, mode, 'lookahead')
        return self.constructs[f'{mode}_lookahead'].format(pat=pat)

    def lookbehind(self, pat, positive=None, mode='positive'):
        """
        A lookbehind pattern that can be positive or negative.

        Args:
            pat (str): the pattern to look behind for.
            positive (bool | None): deprecated, use ``mode`` instead. If
                given, it overrides ``mode``.
            mode (str): ``'positive'`` or ``'negative'``.

        Returns:
            str: the lookbehind expression for this backend.

        Raises:
            KeyError: if ``mode`` is not ``'positive'`` or ``'negative'``.
        """
        mode = self._resolve_lookaround_mode(positive, mode, 'lookbehind')
        return self.constructs[f'{mode}_lookbehind'].format(pat=pat)

    def named_field(self, pat, name=None):
        """
        Wrap ``pat`` in a named group, or a plain group if ``name`` is None.

        Args:
            pat (str): the pattern to wrap.
            name (str | None): name of the group.

        Returns:
            str: the grouped pattern.
        """
        if name is None:
            return self.constructs['group'].format(pat=pat)
        return self.constructs['named_field'].format(pat=pat, name=name)

    def bref_field(self, name):
        """
        Build a reference to the group called ``name`` using the backend's
        ``'backref_field'`` template.

        Args:
            name (str): name of the group to refer back to.

        Returns:
            str: the back-reference expression.
        """
        return self.constructs['backref_field'].format(name=name)

    def escape(self, pat):
        """
        Escape special characters in ``pat`` using :func:`re.escape`.

        Args:
            pat (str): literal text to escape.

        Returns:
            str: the escaped text.
        """
        return re.escape(pat)

    def optional(self, pat):
        """
        Mark ``pat`` as optional by appending ``?``.

        Args:
            pat (str): the pattern to make optional.

        Returns:
            str: ``pat`` followed by ``?``.
        """
        return r'{pat}?'.format(pat=pat)

    def group(self, pat):
        """
        Wrap ``pat`` in the backend's grouping construct.

        Args:
            pat (str): the pattern to group.

        Returns:
            str: the grouped pattern.
        """
        return self.constructs['group'].format(pat=pat)

    def oneof(self, *patterns):
        """
        Build a group that matches any one of the given patterns.

        Args:
            *patterns (str): the alternatives, joined with ``|``.

        Returns:
            str: the grouped alternation.
        """
        return self.group('|'.join(patterns))

    @classmethod
    def coerce(cls, backend='python'):
        """
        Create the builder for the requested backend.

        Args:
            backend (str): either ``'python'`` or ``'vim'``.

        Returns:
            RegexBuilder: an instance of the matching subclass.

        Raises:
            KeyError: if the backend is not recognized.
        """
        if backend == 'python':
            builder_cls = PythonRegexBuilder
        elif backend == 'vim':
            builder_cls = VimRegexBuilder
        else:
            raise KeyError(backend)
        return builder_cls()

    @property
    def identifier(self):
        """
        A word, except it must start with a letter or underscore (not a number)

        References:
            https://stackoverflow.com/questions/5474008/regular-expression-to-confirm-whether-a-string-is-a-valid-python-identifier

        Example:
            >>> from xdev.regex_builder import *  # NOQA
            >>> b = PythonRegexBuilder()
            >>> assert re.match(b.identifier, 'hello')
            >>> assert re.match(b.identifier, '𝛣_ello')
            >>> assert re.match(b.identifier, 'h_1e8llo')
            >>> assert not re.match(b.identifier, '1hello')
        """
        # First character: a word character that is not a digit.
        return r'[^\d\W]\w*'

    @property
    def hex(self):
        """
        A case-independent hex character
        """
        return '[0-9a-fA-F]'

    @property
    def word(self):
        """
        The backend's pattern for a single word character.
        """
        return self.special['word']

    @property
    def whitespace(self):
        """
        Zero or more of the backend's space character pattern.
        """
        return self.special['space'] + '*'

    @property
    def nongreedy(self):
        """
        The backend's non-greedy "zero or more" quantifier.
        """
        return self.special['nongreedy_zero_or_more']

    @property
    def number(self):
        """
        Can match a generic floating point number

        References:
            https://www.regular-expressions.info/floatingpoint.html

        Example:
            >>> from xdev.regex_builder import *  # NOQA
            >>> b = PythonRegexBuilder()
            >>> pat = re.compile('^' + b.number + '$')
            >>> assert pat.match('3.4')
            >>> assert pat.match('3.4e-1')
            >>> assert pat.match('3.4e+1')
            >>> assert not pat.match('3.4a+1')
            >>> b = PythonRegexBuilder()
            >>> num_part = b.named_field(b.number, name='number')
            >>> space_part = b.named_field(' *', name='spaces')
            >>> unit_part = b.named_field('.*', name='unit')
            >>> pat = re.compile('^' + num_part + space_part + unit_part + '$')
            >>> pat.match('3.4').groupdict()
            {'number': '3.4', 'spaces': '', 'unit': ''}
            >>> pat.match('3.1415 foobars').groupdict()
            {'number': '3.1415', 'spaces': ' ', 'unit': 'foobars'}
            >>> pat.match('3.1415foobars').groupdict()
            {'number': '3.1415', 'spaces': '', 'unit': 'foobars'}
            >>> pat.match('+3.1415e9foobars').groupdict()
            {'number': '+3.1415e9', 'spaces': '', 'unit': 'foobars'}
        """
        exponent_part = '[eE][-+]?[0-9]+'
        decimal_part = r'[-+]?[0-9]*\.?[0-9]+'
        # The exponent is grouped so the trailing ``?`` applies to all of it.
        exponent_group = self.constructs['group'].format(pat=exponent_part)
        return decimal_part + exponent_group + '?'
class VimRegexBuilder(RegexBuilder):
    r"""
    Regex builder that emits Vim pattern syntax.

    The templates use the backslash-escaped forms (``\(``, ``\{``, ``\@=``)
    that Vim's default "magic" mode expects.

    References:
        https://dev.to/iggredible/learning-vim-regex-26ep
    """

    # Patterns that only exist (or are spelled differently) in Vim.
    vim_patterns = [
        {'key': 'nongreedy_zero_or_more', 'pattern': r'\{-}', 'docs': r'non-greedily matches zero or more of the pattern to the left', 'alias': ['nongreedy_kleene_star']},
    ]

    def __init__(self):
        self.constructs = {
            'positive_lookahead': r'\({pat}\)\@=',
            'negative_lookahead': r'\({pat}\)\@!',
            'positive_lookbehind': r'\({pat}\)\@<=',
            'negative_lookbehind': r'\({pat}\)\@<!',
            'word': r'\<{pat}\>',
            'group': r'\({pat}\)',
        }
        self.special = {}
        for item in self.common_patterns + self.vim_patterns:
            pattern = item['pattern']
            self.special[item['key']] = pattern
            # Make each declared alias usable as a lookup key as well,
            # without ever shadowing a primary key.
            for alias in item.get('alias', ()):
                self.special.setdefault(alias, pattern)

    def optional(self, pat):
        r"""
        Mark ``pat`` as optional.

        In Vim's magic mode a bare ``?`` is a literal character, so the
        zero-or-one quantifier ``\=`` is used instead.

        Args:
            pat (str): the pattern to make optional.

        Returns:
            str: ``pat`` followed by ``\=``.

        Example:
            >>> from xdev.regex_builder import *  # NOQA
            >>> b = VimRegexBuilder()
            >>> assert b.optional('a') == r'a\='
        """
        return r'{pat}\='.format(pat=pat)

    def oneof(self, *patterns):
        r"""
        Build a group that matches any one of the given patterns.

        In Vim's magic mode a bare ``|`` is a literal character, so the
        alternatives are joined with ``\|``.

        Args:
            *patterns (str): the alternatives.

        Returns:
            str: the grouped alternation.

        Example:
            >>> from xdev.regex_builder import *  # NOQA
            >>> b = VimRegexBuilder()
            >>> assert b.oneof('a', 'b') == r'\(a\|b\)'
        """
        return self.group(r'\|'.join(patterns))

    def previous(self, min=None, max=None, exact=None, greedy=True):
        r"""
        Match the previous pattern some number of times.

        Args:
            min (int | None): minimum number of matches
            max (int | None): maximum number of matches
            exact (int | None):
                Specify exact number of matches.
                Mutex with minimum and max.
            greedy (bool):
                if True match as many as possible, otherwise match as few as
                possible

        Returns:
            str: the quantifier to append to a pattern.

        Raises:
            ValueError: if ``exact`` is combined with ``min`` or ``max``.

        Example:
            >>> from xdev.regex_builder import *  # NOQA
            >>> b = VimRegexBuilder()
            >>> assert b.previous(exact=1) == r'\{1}'
            >>> assert b.previous(min=1, max=3) == r'\{1,3}'
            >>> assert b.previous(min=1, max=3, greedy=False) == r'\{-1,3}'
            >>> assert b.previous(max=3) == r'\{,3}'
            >>> assert b.previous(min=3) == r'\{3,}'
            >>> assert b.previous() == '*'
            >>> assert b.previous(greedy=False) == r'\{-}'
        """
        if exact is not None:
            if min is not None or max is not None:
                raise ValueError('exact cannot be combined with min or max')
            return f'\\{{{exact}}}'

        if min is None:
            min = 0
        if max == float('inf'):
            max = None
        if min == 0 and max is None:
            # Unbounded repetition has a dedicated shorthand.
            return '*' if greedy else '\\{-}'

        greed = '' if greedy else '-'
        # An omitted bound is written as an empty slot, e.g. ``\{,3}``.
        lower = '' if min == 0 else min
        upper = '' if max is None else max
        return f'\\{{{greed}{lower},{upper}}}'
class PythonRegexBuilder(RegexBuilder):
    r"""
    Contains helper methods to construct a regex

    Example:
        >>> b = PythonRegexBuilder()
        >>> pat_text = b.lookbehind('_') + r'v\d+' + b.optional(b.lookahead('_'))
        >>> pat = re.compile(pat_text)
        >>> print(pat.search('_v321_').group())
        v321
        >>> print(pat.search('_v321').group())
        v321
        >>> print(pat.search('fdsfds_v321_fdsfsd').group())
        v321
        >>> print(pat.search('fdsfds_v321fdsfsd').group())
        v321
        >>> print(pat.search('fdsfdsv321fdsfsd'))
        None

    Example:
        >>> # Test multiple negative lookbehind
        >>> b = PythonRegexBuilder()
        >>> suffix = 'foo'
        >>> neg_prefix1 = b.lookbehind('abc', positive=0)
        >>> neg_prefix2 = b.lookbehind('efg', positive=0)
        >>> pat1 = re.compile(neg_prefix1 + suffix)
        >>> pat2 = re.compile(neg_prefix2 + suffix)
        >>> patB = re.compile(neg_prefix1 + neg_prefix2 + suffix)
        >>> cases = ['abcfoo', 'efgfoo', 'hijfoo', 'foo']
        >>> print([bool(pat1.search(c)) for c in cases])
        [False, True, True, True]
        >>> print([bool(pat2.search(c)) for c in cases])
        [True, False, True, True]
        >>> print([bool(patB.search(c)) for c in cases])
        [False, False, True, True]

    References:
        https://www.dataquest.io/blog/regex-cheatsheet/
        https://docs.python.org/3/library/re.html#regular-expression-syntax
    """

    # Patterns that only exist (or are spelled differently) in Python's re.
    python_patterns = [
        {'key': 'nongreedy_zero_or_more', 'pattern': r'*?', 'docs': r'non-greedily matches zero or more of the pattern to the left', 'alias': ['nongreedy_kleene_star']},
        {'key': 'boundary', 'pattern': r'\b', 'docs': r'The boundary at the start or end of a word'},
        {'key': 'non-boundary', 'pattern': r'\B', 'docs': r'Anywhere that is not the start or end of a word'},
        {'key': 'left-expr', 'pattern': r'\A', 'docs': 'Matches only at the start of the string'},
        {'key': 'right-expr', 'pattern': r'\Z', 'docs': 'Matches only at the end of the string'},
    ]

    def __init__(self):
        self.constructs = {
            'positive_lookahead': r'(?={pat})',
            'negative_lookahead': r'(?!{pat})',
            'positive_lookbehind': r'(?<={pat})',
            'negative_lookbehind': r'(?<!{pat})',
            'word': r'\b{pat}\b',
            'group': r'({pat})',
            'named_field': r'(?P<{name}>{pat})',
            'backref_field': r'\g<{name}>',
        }
        self.special = {}
        for item in self.common_patterns + self.python_patterns:
            pattern = item['pattern']
            self.special[item['key']] = pattern
            # Make each declared alias usable as a lookup key as well,
            # without ever shadowing a primary key.
            for alias in item.get('alias', ()):
                self.special.setdefault(alias, pattern)

    def previous(self, min=None, max=None, exact=None, greedy=True):
        r"""
        Match the previous pattern some number of times.

        Args:
            min (int | None): minimum number of matches
            max (int | None): maximum number of matches
            exact (int | None):
                Specify exact number of matches.
                Mutex with minimum and max.
            greedy (bool):
                if True match as many as possible, otherwise match as few as
                possible

        Returns:
            str: the quantifier to append to a pattern.

        Raises:
            ValueError: if ``exact`` is combined with ``min`` or ``max``.

        Example:
            >>> from xdev.regex_builder import *  # NOQA
            >>> b = PythonRegexBuilder()
            >>> assert b.previous(exact=1) == '{1}'
            >>> assert b.previous(min=1, max=3) == '{1,3}'
            >>> assert b.previous(min=1, max=3, greedy=False) == '{1,3}?'
            >>> assert b.previous(max=3) == '{,3}'
            >>> assert b.previous(min=3) == '{3,}'
            >>> assert b.previous() == '*'
            >>> assert b.previous(greedy=False) == '*?'

        Example:
            >>> from xdev.regex_builder import *  # NOQA
            >>> b = PythonRegexBuilder()
            >>> assert re.compile('a' + b.previous(exact=2) + '$').match('aa')
            >>> assert not re.compile('a' + b.previous(exact=2) + '$').match('aaa')
            >>> assert not re.compile('a' + b.previous(exact=2) + '$').match('a')
        """
        if exact is not None:
            if min is not None or max is not None:
                raise ValueError('exact cannot be combined with min or max')
            expr = f'{{{exact}}}'
        else:
            if min is None:
                min = 0
            if max == float('inf'):
                max = None
            if min == 0 and max is None:
                # Unbounded repetition has a dedicated shorthand.
                return '*' if greedy else '*?'
            # An omitted bound is written as an empty slot, e.g. ``{,3}``.
            lower = '' if min == 0 else min
            upper = '' if max is None else max
            expr = f'{{{lower},{upper}}}'
        if not greedy:
            # A trailing ``?`` makes the quantifier non-greedy.
            expr += '?'
        return expr