# Source code for etk.extractors.regex_extractor
import collections
import collections.abc
import re
from enum import Enum, auto
from typing import List

from etk.extraction import Extraction
from etk.extractor import Extractor, InputType
class MatchMode(Enum):
    """Which regular-expression operation ``RegexExtractor.extract`` applies."""

    # NOTE: no trailing commas after auto() -- a trailing comma would turn the
    # member value into a one-element tuple instead of an auto-generated int.
    MATCH = auto()    # match only at the beginning of the text
    SEARCH = auto()   # first match anywhere in the text
    FINDALL = auto()  # every non-overlapping match (implemented with finditer)
    SPLIT = auto()    # split the text around the matches of the pattern
class RegexExtractor(Extractor):
    """
    **Description**
        A wrapper for Python regular expressions. The pattern is compiled once
        in the constructor and applied to a text by :meth:`extract`.

    Examples:
        ::

            pattern = "some_pattern"
            regex_extractor = RegexExtractor(pattern=pattern,
                                             extractor_name="some_name",
                                             flags=re.IGNORECASE)
            regex_extractor.extract(text=input_doc,
                                    flags=re.M,
                                    mode=MatchMode.SEARCH)
    """

    def __init__(self,
                 pattern: str,
                 extractor_name: str,
                 flags=0,
                 general_tag: str = None
                 ) -> None:
        """
        Args:
            pattern (str): the regular expression to compile.
            extractor_name (str): the name handed to the base ``Extractor``.
            flags: ``re`` flags the pattern is compiled with (default 0).
            general_tag (str): tag attached to the extractions built from matches.
        """
        Extractor.__init__(self,
                           input_type=InputType.TEXT,
                           category="regex",
                           name=extractor_name)
        self._compiled_regex = re.compile(pattern, flags)
        # Kept so that per-call flags can be OR-ed onto exactly what was requested here.
        self._flags = flags
        self._general_tag = general_tag

    @property
    def general_tag(self):
        """The tag given to the constructor (may be None)."""
        return self._general_tag

    def extract(self, text: str, flags=0, mode: MatchMode = MatchMode.FINDALL) -> List[Extraction]:
        """
        Extracts information from a text using the given regex.

        For every match, if the pattern has no groups a single Extraction covering
        the whole match is produced; if the pattern has groups, one Extraction is
        produced for each group. Each extraction records the start and end char
        positions of the matched text.

        Args:
            text (str): the text to extract from.
            flags: additional ``re`` flags (e.g. ``re.I``, ``re.M``, ``re.S``) for this
                call only; they are combined with the flags given to the constructor.
            mode (MatchMode): the operation to perform. ``MATCH`` matches at the start
                of the text, ``SEARCH`` finds the first match, ``FINDALL`` finds every
                match, ``SPLIT`` splits the text around the matches.

        Returns:
            List(Extraction): the list of extraction or the empty list if there are no matches.

        Raises:
            KeyError: if ``mode`` is not a supported MatchMode.
            ValueError: if the combined flags are rejected by ``re.compile``.
        """
        regex = self._regex_with(flags)
        if mode is MatchMode.SPLIT:
            return self._extract_split(regex, text)
        match_functions = {
            MatchMode.MATCH: regex.match,
            MatchMode.SEARCH: regex.search,
            MatchMode.FINDALL: regex.finditer,
        }
        # Only the text is passed: the second positional argument of these methods
        # is a start position, not a flags value.
        return self._wrap_result(match_functions[mode](text))

    def _regex_with(self, flags):
        """Return the compiled pattern, recompiled with the extra ``flags`` if any are given."""
        if not flags:
            return self._compiled_regex
        # re.compile caches recently compiled patterns, so repeated calls stay cheap.
        return re.compile(self._compiled_regex.pattern, self._flags | flags)

    # wrap the re return object to list of extraction
    def _wrap_result(self, matches: object) -> List[Extraction]:
        """
        Convert the result of match()/search()/finditer() into extractions.

        Args:
            matches: None, a single match object, or an iterable of match objects.

        Returns:
            List(Extraction): the extractions, empty when ``matches`` is None.
        """
        # match() and search() return None when nothing matched
        if matches is None:
            return []
        # matches are result of finditer(): flatten the extractions of every match
        if isinstance(matches, collections.abc.Iterable):
            res = []
            for match in matches:
                res.extend(self._wrap_result(match))
            return res
        # single match: one extraction per group, or the whole match (group 0)
        # when the pattern has no groups
        group_count = len(matches.groups())
        if group_count:
            # A group that did not take part in the match is still wrapped, so the
            # i-th extraction always corresponds to the i-th group.
            return [self._wrap_extraction(i, matches) for i in range(1, group_count + 1)]
        return [self._wrap_extraction(0, matches)]

    def _extract_split(self, regex, text: str) -> List[Extraction]:
        """
        Split ``text`` around the matches of ``regex``, recording real char offsets.

        The pieces are the stretches of text between consecutive separator matches
        plus, when the pattern has groups, the text captured by each group of the
        separator. Offsets are taken from the match spans, so they stay correct
        whatever the length of the separators. Groups that did not take part in a
        match have no text and no position and are skipped.
        """
        res = []
        piece_start = 0
        for match in regex.finditer(text):
            res.append(Extraction(value=text[piece_start:match.start()], extractor_name=self.name,
                                  start_char=piece_start, end_char=match.start()))
            for i in range(1, regex.groups + 1):
                if match.group(i) is not None:
                    res.append(Extraction(value=match.group(i), extractor_name=self.name,
                                          start_char=match.start(i), end_char=match.end(i)))
            piece_start = match.end()
        # trailing piece after the last separator (the whole text if there was none)
        res.append(Extraction(value=text[piece_start:], extractor_name=self.name,
                              start_char=piece_start, end_char=len(text)))
        return res

    def _wrap_extraction(self, group_idx: int, matches: object) -> Extraction:
        """Build the Extraction for group ``group_idx`` of the match object ``matches``."""
        start, end = matches.start(group_idx), matches.end(group_idx)
        text = matches.group(group_idx)
        return Extraction(value=text, extractor_name=self.name,
                          start_char=start, end_char=end, tag=self.general_tag)