Source code for etk.extractors.regex_extractor

import collections
import collections.abc
import re
from enum import Enum, auto
from typing import List

from etk.extraction import Extraction
from etk.extractor import Extractor, InputType

class MatchMode(Enum):
    """How RegexExtractor applies its compiled pattern to the input text."""

    # No trailing commas here: ``auto(),`` would make the member's value a
    # one-element tuple instead of a plain int.
    MATCH = auto()
    SEARCH = auto()
    FINDALL = auto()
    SPLIT = auto()

class RegexExtractor(Extractor):
    """
    **Description**
        A wrapper for Python regular expressions. The pattern is compiled once
        at construction time; ``extract`` applies it to a text in the requested
        ``MatchMode`` and wraps the results in ``Extraction`` objects.

    Examples:
        ::

            pattern = "some_pattern"
            regex_extractor = RegexExtractor(pattern=pattern,
                                             extractor_name="my_regex",
                                             flags=re.IGNORECASE)
            regex_extractor.extract(text=input_doc,
                                    flags=re.M,
                                    mode=MatchMode.SEARCH)
    """

    def __init__(self,
                 pattern: str,
                 extractor_name: str,
                 flags=0,
                 general_tag: str = None
                 ) -> None:
        """
        Args:
            pattern (str): the regular expression to compile.
            extractor_name (str): name passed on to the base ``Extractor``.
            flags: ``re`` flags (e.g. ``re.IGNORECASE``) used to compile the pattern.
            general_tag (str): tag attached to extractions built from match objects.
        """
        super().__init__(input_type=InputType.TEXT,
                         category="regex",
                         name=extractor_name)

        self._compiled_regex = re.compile(pattern, flags)
        self._general_tag = general_tag
        self._match_functions = self._build_match_functions(self._compiled_regex)

    @staticmethod
    def _build_match_functions(compiled_regex) -> dict:
        """Map every MatchMode to the corresponding bound method of ``compiled_regex``."""
        return {
            MatchMode.MATCH: compiled_regex.match,
            MatchMode.SEARCH: compiled_regex.search,
            MatchMode.FINDALL: compiled_regex.finditer,
            MatchMode.SPLIT: compiled_regex.split,
        }

    @property
    def general_tag(self):
        """The tag given at construction time (may be None)."""
        return self._general_tag

    def extract(self, text: str, flags=0, mode: MatchMode = MatchMode.FINDALL) -> List[Extraction]:
        """
        Extracts information from a text using the given regex.

        For MATCH, SEARCH and FINDALL: if the pattern has no groups, every match
        yields a single Extraction covering the whole match; if the pattern has
        groups, every match yields one Extraction per group. Each of these
        extractions records the start and end char positions of what it matched.
        For SPLIT, one Extraction is returned per piece produced by the split.

        Args:
            text (str): the text to extract from.
            flags: additional ``re`` flags (``re.A``, ``re.I``, ``re.L``, ``re.M``,
                ``re.S``, ``re.U``, ``re.X``, combined with ``|``) applied for this
                call only, on top of the flags given to the constructor.
            mode (enum[MatchMode.MATCH, MatchMode.SEARCH, MatchMode.FINDALL, MatchMode.SPLIT]):
                which regex operation to run: ``match``, ``search``, ``finditer``
                or ``split``.

        Returns:
            List(Extraction): the list of extraction or the empty list if there are no matches.

        Raises:
            KeyError: if ``mode`` is not a MatchMode member.
        """
        if flags:
            # Methods of a compiled pattern do not accept flags: their second
            # positional argument is ``pos`` (``maxsplit`` for split). Passing
            # flags there would silently change where matching starts, so the
            # pattern is recompiled with the extra flags instead (re caches
            # compiled patterns, which keeps this cheap).
            regex = re.compile(self._compiled_regex.pattern,
                               self._compiled_regex.flags | flags)
            match_functions = self._build_match_functions(regex)
        else:
            match_functions = self._match_functions

        matches = match_functions[mode](text)
        return self._wrap_result(matches)

    def _wrap_result(self, matches: object) -> List[Extraction]:
        """
        Wrap whatever the regex call returned into a list of Extraction.

        Args:
            matches: None (no match), a list (result of split()), an iterator of
                match objects (result of finditer()) or a single match object.

        Returns:
            List(Extraction): possibly empty list of extractions.
        """
        # match() / search() return None when nothing matched
        if matches is None:
            return []

        # matches are result of split()
        if isinstance(matches, list):
            return self._wrap_split_extraction(matches)

        # matches are result of finditer(); a match object itself is not
        # Iterable, so it falls through to the single-match case below
        if isinstance(matches, collections.abc.Iterable):
            res = []
            for match in matches:
                res.extend(self._wrap_result(match))
            return res

        # single match: one extraction per group, or the whole match if the
        # pattern defines no groups.
        # NOTE: a group that did not participate in the match is still wrapped;
        # its value is None and its offsets are -1.
        group_count = len(matches.groups())
        if group_count:
            return [self._wrap_extraction(i, matches) for i in range(1, group_count + 1)]
        return [self._wrap_extraction(0, matches)]

    def _wrap_split_extraction(self, items: List[str]) -> List[Extraction]:
        """
        Wrap the pieces returned by split() into extractions.

        NOTE: start/end are running totals of the piece lengths only; the
        length of the separators is not counted, so they are not positions in
        the original text whenever a separator is non-empty. The general tag is
        not attached to these extractions.
        """
        res = []
        start = 0
        for item in items:
            # split() yields None for a capture group that did not participate
            if item is None:
                continue
            end = start + len(item)
            # NOTE(review): assumes Extractor exposes the name given at
            # construction as self.name — confirm against the base class.
            res.append(Extraction(value=item, extractor_name=self.name,
                                  start_char=start, end_char=end))
            start = end
        return res

    def _wrap_extraction(self, group_idx: int, matches: object) -> Extraction:
        """Build the Extraction for group ``group_idx`` (0 = whole match) of a match object."""
        start, end = matches.start(group_idx), matches.end(group_idx)
        text = matches.group(group_idx)
        # NOTE(review): assumes Extractor exposes the name given at
        # construction as self.name — confirm against the base class.
        return Extraction(value=text, extractor_name=self.name,
                          start_char=start, end_char=end, tag=self.general_tag)