Source code for xdev.patterns

"""
An encapsulation of regex and glob (and maybe other) patterns.

Note:
    This implementation is maintained in kwutil and xdev. These versions should
    be kept in sync.

    See:
        ~/code/kwutil/kwutil/util_pattern.py
        ~/code/xdev/xdev/patterns.py

TODO:
    rectify with xdev / whatever package this goes in
"""
import os
import re
import fnmatch
import ubelt as ub
import pathlib

# Resolve the class of compiled regular expressions. Prefer the public
# ``re.Pattern`` name when the running interpreter exposes it.
RE_Pattern = getattr(re, 'Pattern', None)
if RE_Pattern is None:
    # Older interpreters (Python <= 3.6) do not export ``re.Pattern``, so
    # take the type from an actual compiled pattern instead.
    RE_Pattern = type(re.compile('.*'))

# ``parse`` is an optional dependency. When it is missing we install a stand-in
# object so that importing this module still works; the failure is deferred
# until a parse-backed pattern is actually requested.
try:
    import parse
except ImportError:
    class FakeParseModule:
        """
        Placeholder bound to the name ``parse`` when the real module cannot
        be imported.
        """

        def Parser(self, *args, **kwargs):
            """
            Stand-in for ``parse.Parser``. Accepts any arguments.

            Raises:
                ImportError: always
            """
            message = 'Unable to import parse'
            raise ImportError(message)

    parse = FakeParseModule()


class PatternBase:
    """
    Abstract class that defines the Pattern api

    Every method here raises :class:`NotImplementedError`; subclasses override
    the operations they support.
    """

    def match(self, text):
        """
        Test ``text`` against the pattern.

        Args:
            text (str): text to test

        Raises:
            NotImplementedError: always, in this base class
        """
        raise NotImplementedError()

    def search(self, text):
        """
        Look for the pattern inside ``text``.

        Args:
            text (str): text to search

        Raises:
            NotImplementedError: always, in this base class
        """
        raise NotImplementedError()

    def sub(self, repl, text):
        """
        Replace occurrences of the pattern in ``text`` with ``repl``.

        Args:
            repl (str): replacement text
            text (str): text to modify

        Raises:
            NotImplementedError: always, in this base class
        """
        raise NotImplementedError()
# # TODO: wrapper for results of re and fnmatch # class Match(ub.NiceRepr): # def __init__(self, string, pos, endpos): # self.string = string # self.pos = pos # self.endpos = endpos # self.lastgroup # self.lastindex # def span(self): # return (self.pos, self.endpos) # def __nice__(self): # return self.string # def __bool__(self): # return True
def _maybe_expandable_glob(pat):
    """
    Cheap check for whether a string might be an expandable glob pattern.

    Looks for the special glob characters ``*``, ``?`` and a ``[`` / ``]``
    pair.

    Note:
        ``!`` is also special, but it only has meaning inside of a ``[]``
        bracket, so it does not need to be checked separately.

    Args:
        pat (str): the candidate pattern text

    Returns:
        bool:
            If False, the input is definitely not an expandable glob pattern
            (it may still be used as a glob pattern, but that is equivalent
            to strict matching). If True, special glob characters are
            present, but the string is not guaranteed to be a valid glob
            pattern (e.g. the brackets are not checked for ordering).
    """
    # Either wildcard on its own is enough.
    if '*' in pat or '?' in pat:
        return True
    # A character class needs both an opening and a closing bracket.
    return '[' in pat and ']' in pat
class Pattern(PatternBase, ub.NiceRepr):
    """
    Provides a common API to several common pattern matching syntaxes.

    A general patterns class, which can use a backend from BACKENDS

    Args:
        pattern (str | object):
            The pattern text or a precompiled backend pattern object

        backend (str):
            Code indicating what backend the pattern text should be
            interpreted with. See BACKENDS for available choices.

    Notes:
        # BACKENDS

        The glob backend uses the :mod:`fnmatch` module [fnmatch_docs]_.

        The regex backend uses the Python :mod:`re` module.

        The strict backend uses the "==" string equality testing.

        The parse backend uses the :mod:`parse` module.

    References:
        .. [fnmatch_docs] https://docs.python.org/3/library/fnmatch.html

    Example:
        >>> # Test Regex backend
        >>> repat = Pattern.coerce('foo.*', 'regex')
        >>> assert repat.match('foobar')
        >>> assert not repat.match('barfoo')
        >>> match = repat.search('baz-biz-foobar')
        >>> match = repat.match('baz-biz-foobar')
        >>> # Test Glob backend
        >>> globpat = Pattern.coerce('foo*', 'glob')
        >>> assert globpat.match('foobar')
        >>> assert not globpat.match('barfoo')
        >>> globpat = Pattern.coerce('[foo|bar]', 'glob')
        >>> globpat.match('foo')

    Example:
        >>> # xdoctest: +REQUIRES(module:parse)
        >>> # Test parse backend
        >>> pattern1 = Pattern.coerce('A {adjective} pattern', 'parse')
        >>> result1 = pattern1.match('A cool pattern')
        >>> print(f'result1.named = {ub.urepr(result1.named, nl=1)}')
        >>> pattern2 = pattern1.to_regex()
        >>> result2 = pattern2.match('A cool pattern')
    """

    def __init__(self, pattern, backend):
        # Paths are stored as plain strings so every backend sees text.
        if isinstance(pattern, pathlib.Path):
            pattern = os.fspath(pattern)
        # String patterns are compiled eagerly for the backends that have a
        # compiled form; precompiled objects are stored as given.
        if backend == 'regex':
            if isinstance(pattern, str):
                pattern = re.compile(pattern)
        elif backend == 'parse':
            if isinstance(pattern, str):
                pattern = parse.Parser(pattern)
        self.pattern = pattern
        self.backend = backend

    def __nice__(self) -> str:
        return '{}, {}'.format(self.pattern, self.backend)

    def to_regex(self):
        """
        Returns an equivalent pattern with the regular expression backend

        Returns:
            Pattern

        Raises:
            AssertionError: if the backend is not one of the known backends

        Example:
            >>> globpat = Pattern.coerce('foo*', 'glob')
            >>> strictpat = Pattern.coerce('foo*', 'strict')
            >>> repat1 = strictpat.to_regex()
            >>> repat2 = globpat.to_regex()
            >>> print(f'repat1={repat1}')
            >>> print(f'repat2={repat2}')
        """
        if self.backend == 'regex':
            regex_pattern = self.pattern
        elif self.backend == 'parse':
            # NOTE(review): relies on a private attribute of the parse.Parser
            # object - confirm it exists in the supported parse versions.
            # regex_pattern = self.pattern._generate_expression()
            regex_pattern = self.pattern._expression
        elif self.backend == 'glob':
            regex_pattern = fnmatch.translate(self.pattern)
        elif self.backend == 'strict':
            # Escape so that the text only matches itself literally.
            regex_pattern = re.escape(self.pattern)
        else:
            raise AssertionError
        new = self.__class__(regex_pattern, 'regex')
        return new

    @classmethod
    def from_regex(cls, data, flags=0, multiline=False, dotall=False,
                   ignorecase=False):
        """
        Create a Pattern object with a regex backend.

        Args:
            data (str): the regular expression text
            flags (int): initial :mod:`re` flags to compile with
            multiline (bool): if True, adds ``re.MULTILINE``
            dotall (bool): if True, adds ``re.DOTALL``
            ignorecase (bool): if True, adds ``re.IGNORECASE``

        Returns:
            Pattern
        """
        # Each keyword enables exactly its own flag. (Previously a duplicated
        # ``if multiline`` branch also switched on DOTALL.)
        if multiline:
            flags |= re.MULTILINE
        if dotall:
            flags |= re.DOTALL
        if ignorecase:
            flags |= re.IGNORECASE
        pat = re.compile(data, flags=flags)
        self = cls(pat, 'regex')
        return self

    @classmethod
    def from_glob(cls, data):
        """
        Create a Pattern object with a glob backend.

        Args:
            data (str | PathLike): the glob pattern

        Returns:
            Pattern
        """
        self = cls(data, 'glob')
        return self

    @classmethod
    def coerce_backend(cls, data, hint='auto'):
        """
        Determine which backend code should be used to interpret ``data``.

        Args:
            data (str | Pattern | PathLike | re.Pattern):
                the pattern-like input
            hint (str):
                backend to use when it is not determined by the type of
                ``data``. If 'auto', strings without special glob characters
                are 'strict' and everything else is 'glob'.

        Returns:
            str: the backend code

        Example:
            >>> assert Pattern.coerce_backend('foo', hint='auto') == 'strict'
            >>> assert Pattern.coerce_backend('foo*', hint='auto') == 'glob'
            >>> assert Pattern.coerce_backend(re.compile('foo*'), hint='auto') == 'regex'
        """
        if isinstance(data, RE_Pattern):
            backend = 'regex'
        elif isinstance(data, cls) or type(data).__name__ == cls.__name__:
            # The name comparison also accepts instances of a same-named
            # class that is not this exact class object.
            backend = data.backend
        else:
            if hint == 'auto':
                hint = 'glob'
                if isinstance(data, str):
                    if not _maybe_expandable_glob(data):
                        hint = 'strict'
            backend = hint
        return backend

    @classmethod
    def coerce(cls, data, hint='auto'):
        """
        Attempt to automatically determine the input data as the appropriate
        pattern. If it cannot be determined, then fallback to the hint.

        Args:
            data (str | Pattern | PathLike):
                the pattern-like input

            hint (str):
                can be a backend code (e.g. 'glob', 'regex', 'strict') or
                'auto'. In 'auto' we will use 'glob' unless the input is a
                string without special glob characters (``*``, ``?``,
                ``[]``), in which case we use 'strict'. Pattern inputs keep
                their existing interpretation.

        Returns:
            Pattern

        Example:
            >>> pat = Pattern.coerce('foo*', 'glob')
            >>> pat2 = Pattern.coerce(pat, 'regex')
            >>> print('pat = {}'.format(ub.urepr(pat, nl=1)))
            >>> print('pat2 = {}'.format(ub.urepr(pat2, nl=1)))
        """
        if isinstance(data, cls) or type(data).__name__ == cls.__name__:
            # Already a pattern: returned as-is, the hint is ignored.
            self = data
        else:
            backend = cls.coerce_backend(data, hint=hint)
            self = cls(data, backend)
        return self

    def match(self, text):
        """
        Test if ``text`` matches the pattern.

        Args:
            text (str): text to test

        Returns:
            object:
                The result type depends on the backend: whatever the compiled
                object's ``match`` (regex) or ``parse`` (parse) method
                returns, or a bool for the glob and strict backends.

        Raises:
            KeyError: if the backend is unknown
        """
        # TODO standardize return value with a Result class.
        if self.backend == 'regex':
            return self.pattern.match(text)
        elif self.backend == 'parse':
            return self.pattern.parse(text)
        elif self.backend == 'glob':
            return fnmatch.fnmatch(text, self.pattern)
        elif self.backend == 'strict':
            return self.pattern == text
        else:
            raise KeyError(self.backend)

    def search(self, text):
        """
        Test if the pattern occurs anywhere inside of ``text``.

        Args:
            text (str): text to search

        Returns:
            object:
                The result type depends on the backend: whatever the compiled
                object's ``search`` method returns (regex and parse), or a
                bool for the glob and strict backends.

        Raises:
            KeyError: if the backend is unknown
        """
        if self.backend == 'regex':
            return self.pattern.search(text)
        elif self.backend == 'parse':
            return self.pattern.search(text)
        elif self.backend == 'glob':
            # Surround with wildcards so the glob may match any substring.
            return fnmatch.fnmatch(text, '*{}*'.format(self.pattern))
        elif self.backend == 'strict':
            return self.pattern in text
        else:
            raise KeyError(self.backend)

    def sub(self, repl, text, count=-1):
        """
        Replace occurrences of the pattern in ``text``.

        Args:
            repl (str): text to insert in place of pattern

            text (str): text to be searched and modified

            count (int): if non-negative, the maximum number of replacements
                that will be made.

        Returns:
            str: the modified text

        Raises:
            NotImplementedError: for the parse and glob backends
            KeyError: if the backend is unknown
        """
        if count == 0:
            # Zero replacements requested. Handled up front because the
            # regex API treats a count of 0 as "replace everything".
            return text
        if self.backend == 'regex':
            # make regex conform to the API (negative means unlimited)
            return self.pattern.sub(repl, text, count=max(0, count))
        elif self.backend == 'parse':
            raise NotImplementedError
        elif self.backend == 'glob':
            raise NotImplementedError
        elif self.backend == 'strict':
            # ``count`` must be positional: str.replace only accepts it as a
            # keyword argument on Python 3.13+.
            return text.replace(self.pattern, repl, count)
        else:
            raise KeyError(self.backend)

    def paths(self, cwd=None, recursive=False):
        """
        Find paths in the filesystem that match this pattern

        Args:
            cwd (str | PathLike | None):
                directory the pattern is evaluated in.
                NOTE(review): presumably ``ChDir(None)`` leaves the working
                directory alone - confirm against ubelt.

            recursive (bool): forwarded to :func:`glob.glob`

        Yields:
            ub.Path

        Raises:
            NotImplementedError: for backends other than glob and strict

        Note:
            The items are yielded from inside the ``ChDir`` context, so the
            changed working directory stays in effect whenever the generator
            is suspended and is only restored once the generator finishes or
            is closed.
        """
        from ubelt.util_path import ChDir
        if self.backend == 'glob':
            import glob
            with ChDir(cwd):
                yield from map(ub.Path, glob.glob(
                    self.pattern, recursive=recursive))
        elif self.backend == 'strict':
            with ChDir(cwd):
                p = ub.Path(self.pattern)
                if p.exists():
                    yield p
        else:
            raise NotImplementedError
class MultiPattern(PatternBase, ub.NiceRepr):
    """
    A group of patterns whose individual results are combined by a predicate.

    Args:
        patterns (List[Pattern | MultiPattern]): the patterns to combine

        predicate (Callable): the builtin ``any`` or ``all``

    Example:
        >>> dpath = ub.Path.appdir('xdev/tests/multipattern_paths').ensuredir().delete().ensuredir()
        >>> (dpath / 'file0.txt').touch()
        >>> (dpath / 'data0.dat').touch()
        >>> (dpath / 'other0.txt').touch()
        >>> ((dpath / 'dir1').ensuredir() / 'file1.txt').touch()
        >>> ((dpath / 'dir2').ensuredir() / 'file2.txt').touch()
        >>> ((dpath / 'dir2').ensuredir() / 'file3.txt').touch()
        >>> ((dpath / 'dir1').ensuredir() / 'data.dat').touch()
        >>> ((dpath / 'dir2').ensuredir() / 'data.dat').touch()
        >>> ((dpath / 'dir2').ensuredir() / 'data.dat').touch()
        >>> pat = MultiPattern.coerce(['*.txt'], 'glob')
        >>> print(list(pat.paths(cwd=dpath)))
        >>> pat = MultiPattern.coerce(['*0*', '**/*.txt'], 'glob')
        >>> print(list(pat.paths(cwd=dpath, recursive=1)))
        >>> pat = MultiPattern.coerce(['*.txt', '**/*.txt', '**/*.dat'], 'glob')
        >>> print(list(pat.paths(cwd=dpath)))
    """

    def __init__(self, patterns, predicate):
        self.predicate = predicate
        self.patterns = patterns

    def __nice__(self):
        return f'{self.predicate.__name__}({[str(p) for p in self.patterns]})'

    def match(self, text):
        """
        Test ``text`` against every pattern and combine with the predicate.

        Args:
            text (str): text to test

        Returns:
            bool: the predicate applied to the individual match results
        """
        # TODO: when predicate is any, return the first truthy match object
        # When it is all, not sure how to make that work nicely.
        return self.predicate(p.match(text) for p in self.patterns)

    def paths(self, cwd=None, recursive=False):
        """
        Find paths in the filesystem that match the combined patterns.

        Args:
            cwd (str | PathLike | None): forwarded to each pattern's ``paths``

            recursive (bool): forwarded to each pattern's ``paths``

        Yields:
            ub.Path: for ``any``, the unique paths found by at least one
                pattern; for ``all``, the unique paths found by every
                pattern, in the order the first pattern produced them.
                Nothing is yielded when there are no patterns.

        Raises:
            NotImplementedError: if the predicate is neither any nor all
        """
        # Lazily evaluated: each sub-generator is exhausted before the next
        # one is started.
        groups = (p.paths(cwd=cwd, recursive=recursive)
                  for p in self.patterns)
        if self.predicate is any:
            yield from ub.unique(ub.flatten(groups))
        elif self.predicate is all:
            found = [list(group) for group in groups]
            # set.intersection requires at least one argument, so an empty
            # pattern list is handled by yielding nothing.
            if found:
                common = set.intersection(*map(set, found))
                # Walk the first group to keep a deterministic order.
                yield from (p for p in ub.unique(found[0]) if p in common)
        else:
            raise NotImplementedError

    # def search(self, text):
    #     return self.predicate(p.search(text) for p in self.patterns)

    def _squeeze(self):
        """
        Unwrap this object when it holds exactly one pattern.

        Returns:
            Pattern | MultiPattern: the only contained pattern if there is
                exactly one, otherwise ``self``.

        Raises:
            NotImplementedError: if the predicate is neither any nor all
        """
        if self.predicate in {any, all}:
            if len(self.patterns) == 1:
                new = self.patterns[0]
            else:
                new = self
        else:
            raise NotImplementedError
        return new

    @classmethod
    def coerce(cls, data, hint='auto', predicate='any'):
        """
        Args:
            data (str | List | Pattern | PathLike | MultiPattern):
                a single pattern-like item or an iterable of them. An
                existing MultiPattern is returned unchanged.

            hint (str):
                can be 'glob', 'regex', 'strict' or 'auto'. Forwarded to
                :func:`Pattern.coerce_backend` for each item. Pattern inputs
                keep their existing interpretation.

            predicate (str | Callable):
                how the patterns are combined: 'any' or 'all' (or the
                builtin ``any`` / ``all`` functions). Only applies to the
                returned top-level object; nested iterables inside ``data``
                are coerced with the default 'any'.

        Returns:
            MultiPattern

        Raises:
            NotImplementedError: if the predicate is not recognized

        Example:
            >>> pat = MultiPattern.coerce('foo*', 'glob')
            >>> pat2 = MultiPattern.coerce(pat, 'regex')
            >>> pat3 = MultiPattern.coerce([pat, pat], 'regex')
            >>> pat4 = MultiPattern.coerce([ub.Path('bar*'), pat], 'regex')
            >>> print('pat = {}'.format(ub.urepr(pat, nl=1)))
            >>> print('pat2 = {}'.format(ub.urepr(pat2, nl=1)))
            >>> print('pat3 = {!r}'.format(pat3))
            >>> print('pat4 = {!r}'.format(pat4))
            >>> pat00 = MultiPattern.coerce('foo', 'glob')
            >>> pat01 = MultiPattern.coerce('foo*', 'glob')
            >>> pat02 = MultiPattern.coerce('foo*', 'regex')
            >>> pat5 = MultiPattern.coerce(['foo', 'foo*', pat, pat00, pat01, pat02])
            >>> print(f'pat5={pat5}')

        Example:
            >>> # Test all acceptable input types
            >>> import itertools as it
            >>> str_pat = 'pattern*'
            >>> scalar_inputs = {
            >>>     'str': str_pat,
            >>>     'path': ub.Path(str_pat),
            >>>     'pat': Pattern.coerce(str_pat),
            >>>     'mpat': MultiPattern.coerce(str_pat)
            >>> }
            >>> # Test scalar input types
            >>> scalar_outputs = {}
            >>> for k, v in scalar_inputs.items():
            >>>     scalar_outputs[k] = MultiPattern.coerce(v)
            >>> print('scalar_outputs = {}'.format(ub.urepr(scalar_outputs, nl=1)))
            >>> #
            >>> # Test iterable input types
            >>> multi_outputs = []
            >>> for v in it.combinations(scalar_inputs.values(), 2):
            >>>     multi_outputs.append(MultiPattern.coerce(v))
            >>> for v in it.combinations(scalar_inputs.values(), 3):
            >>>     multi_outputs.append(MultiPattern.coerce(v))
            >>> # Higher order nesting test
            >>> higher_order_output = MultiPattern.coerce(multi_outputs)
            >>> print('higher_order_output = {}'.format(ub.urepr(higher_order_output, nl=1)))
        """
        if isinstance(data, cls) or type(data).__name__ == cls.__name__:
            # Already a MultiPattern: returned as-is.
            return data

        # coerce predicate
        if predicate == 'any' or predicate is any:
            predicate = any
        elif predicate == 'all' or predicate is all:
            predicate = all
        else:
            raise NotImplementedError

        if isinstance(data, (str, os.PathLike, Pattern)):
            # A single scalar item becomes a one-element MultiPattern.
            backend = Pattern.coerce_backend(data, hint=hint)
            pat = Pattern.coerce(data, backend)
            self = MultiPattern([pat], predicate)
        else:
            # Otherwise treat the input as an iterable of pattern-like
            # items. Each is coerced recursively, and one-element results are
            # unwrapped to avoid needless nesting.
            self = MultiPattern([
                MultiPattern.coerce(d, hint)._squeeze()
                for d in data], predicate)
        return self