# Source code for etk.extractors.email_extractor

from etk.extractor import Extractor, InputType
from etk.extraction import Extraction
from spacy.matcher import Matcher
from spacy.attrs import LIKE_EMAIL
from typing import List
import copy


# Provider tokens (the token immediately after "@") that cause a candidate
# email to be rejected by EmailExtractor._check_domain — presumably common
# false-positive fragments in the source text (TODO confirm with the callers).
FILTER_PROVIDER = ["noon", "no"]


class EmailExtractor(Extractor):
    """
    **Description**
        This class uses the spaCy Matcher with spaCy's predefined 'LIKE_EMAIL'
        pattern to extract email addresses.

        More information: https://spacy.io/api/matcher#add

    Examples:
        ::

            email_extractor = EmailExtractor(...)
            email_extractor.extract(text=input_doc,...)
    """

    def __init__(self, nlp, tokenizer, extractor_name: str) -> None:
        """
        Initialize the extractor, storing the rule information and construct spacy rules

        Args:
            nlp: spaCy Language pipeline used to tokenize and match the input text
            tokenizer: Tokenizer used to re-tokenize each candidate for the
                provider-domain check
            extractor_name: str, name attached to every returned Extraction

        Returns:
        """
        Extractor.__init__(self,
                           input_type=InputType.TEXT,
                           category="build_in_extractor",
                           name=extractor_name)
        # Deep-copy so any state the matcher attaches to the pipeline never
        # leaks back into the caller's nlp object.
        self._nlp = copy.deepcopy(nlp)
        self._like_email_matcher = Matcher(self._nlp.vocab)
        self._tokenizer = tokenizer
        # BUGFIX: the original re-registered the LIKE_EMAIL pattern on every
        # extract() call, growing the matcher's pattern list without bound.
        # This flag makes _load_email_matcher idempotent.
        self._matcher_loaded = False

    def _load_email_matcher(self) -> None:
        """Register the LIKE_EMAIL pattern with the matcher (at most once)."""
        if not self._matcher_loaded:
            self._like_email_matcher.add("Email", None, [{LIKE_EMAIL: True}])
            self._matcher_loaded = True

    def extract(self, text: str) -> List[Extraction]:
        """
        Extract email addresses from the input text.

        Args:
            text (str): the input source to be processed

        Returns:
            List[Extraction]: the list of extractions returned by EmailExtractor
        """
        result = []
        first_phase_doc = self._nlp(text)
        self._load_email_matcher()
        like_email_matches = self._like_email_matcher(first_phase_doc)

        # First pass: spans the LIKE_EMAIL pattern matched directly, filtered
        # by the provider-domain check.
        like_emails_filtered = []
        for match_id, start, end in like_email_matches:
            span = first_phase_doc[start:end]
            if self._check_domain(self._tokenizer.tokenize(span.text)):
                like_emails_filtered.append((span.text,
                                             span[0].idx,
                                             span[-1].idx + len(span[-1])))

        # Second pass: emails glued to a preceding "mail:" with no space.
        non_space_emails = self._get_non_space_email(first_phase_doc)

        # Union through a set to drop duplicates found by both passes.
        emails = set(like_emails_filtered).union(non_space_emails)
        for email in emails:
            result.append(Extraction(value=email[0],
                                     extractor_name=self.name,
                                     start_char=email[1],
                                     end_char=email[2]))
        return result

    @staticmethod
    def _check_domain(tokens) -> bool:
        """
        Check whether the email provider (token right after "@") should be kept.

        Args:
            tokens: tokenized email text; each token exposes .text and .i

        Returns:
            bool: False when no "@" is present, the "@" is the last token, or
            the provider token is listed in FILTER_PROVIDER; True otherwise.
        """
        idx = None
        for e in tokens:
            if e.text == "@":
                idx = e.i
                break
        # BUGFIX: the original tested `if not idx`, which also fired when "@"
        # was the very first token (idx == 0), wrongly rejecting the email.
        # Also guard idx + 1 against running past the end of the token list,
        # which previously raised IndexError when "@" was the last token.
        if idx is None or idx + 1 >= len(tokens):
            return False
        return tokens[idx + 1].text not in FILTER_PROVIDER

    def _get_non_space_email(self, doc) -> List:
        """
        Handle the corner case where "mail:" is glued to the address with no
        space (e.g. "email:foo@bar.com" tokenized as a single token).

        Args:
            doc: spaCy Doc of the original input text

        Returns:
            List: tuples of (email_text, start_char, end_char), with offsets
            relative to the original input text
        """
        result_lst = []
        for e in doc:
            if "mail:" in e.text.lower():
                # Everything after "mail:" inside this token is the candidate.
                idx = e.text.lower().index("mail:") + 5
                value = e.text[idx:]
                tmp_doc = self._nlp(value)
                tmp_email_matches = self._like_email_matcher(tmp_doc)
                for match_id, start, end in tmp_email_matches:
                    span = tmp_doc[start:end]
                    if self._check_domain(self._tokenizer.tokenize(span.text)):
                        # BUGFIX: the original reported offsets spanning the
                        # whole tail `value`; derive them from the matched span
                        # so the range covers exactly the reported email text.
                        start_char = idx + e.idx + span[0].idx
                        end_char = idx + e.idx + span[-1].idx + len(span[-1])
                        result_lst.append((span.text, start_char, end_char))
        return result_lst