Source code for etk.extractors.spacy_ner_extractor
import spacy
from etk.extractor import Extractor, InputType
from etk.extraction import Extraction
from typing import List
class SpacyNerExtractor(Extractor):
    """
    **Description**
        This extractor takes a list of spaCy NER tags as reference and
        extracts every substring of the input text whose entity label is
        one of those tags.

    Examples:
        ::

            get_attr = ['PERSON', 'ORG', 'GPE']
            spacy_ner_extractor = SpacyNerExtractor('spacy_ner_extractor')
            spacy_ner_extractor.extract(text=text, get_attr=get_attr)

    """

    # Labels extracted when ``extract`` is called without ``get_attr``.
    # Kept as a tuple so the default can never be mutated between calls.
    _DEFAULT_ATTRS = ('PERSON', 'ORG', 'GPE')

    # Lazily loaded default pipeline, shared by every instance that does not
    # supply its own ``nlp`` (see ``_get_default_nlp``).
    _default_nlp = None

    def __init__(self, extractor_name: str, nlp=None):
        """
        Args:
            extractor_name (str): name reported on every Extraction produced
                by this extractor.
            nlp: a loaded spaCy pipeline. When omitted (``None``), the
                'en_core_web_sm' model is loaded on first use and shared
                between instances.
        """
        Extractor.__init__(self, input_type=InputType.TEXT,
                           category="built_in_extractor",
                           name=extractor_name)
        # Loading the model here, rather than in the signature's default
        # value, avoids loading it as a side effect of importing the module.
        self.__nlp = nlp if nlp is not None else self._get_default_nlp()
        # all_attrs = ['PERSON', 'NORP', 'FAC', 'ORG', 'GPE', 'LOC', 'PRODUCT', 'EVENT', 'WORK_OF_ART', 'LAW', 'LANGUAGE',
        #              'DATE', 'TIME', 'PERCENT', 'MONEY', 'QUANTITY', 'ORDINAL', 'CARDINAL']

    @staticmethod
    def _get_default_nlp():
        """Return the shared 'en_core_web_sm' pipeline, loading it on first call."""
        # Stored on the base class explicitly so subclasses reuse the same
        # loaded model instead of each caching their own copy.
        if SpacyNerExtractor._default_nlp is None:
            SpacyNerExtractor._default_nlp = spacy.load('en_core_web_sm')
        return SpacyNerExtractor._default_nlp

    def extract(self, text: str, get_attr: List[str] = None) -> List[Extraction]:
        """
        Args:
            text (str): the text to extract from.
            get_attr (List[str]): The spaCy NER attributes we're interested in.
                Defaults to ['PERSON', 'ORG', 'GPE'] when omitted.

        Returns:
            List(Extraction): the list of extraction or the empty list if there are no matches.
        """
        if get_attr is None:
            get_attr = self._DEFAULT_ATTRS

        doc = self.__nlp(text)
        # One Extraction per recognised entity whose label was requested,
        # in the order the entities appear in ``doc.ents``.
        return [
            Extraction(extractor_name=self.name,
                       start_char=int(ent.start_char),
                       end_char=int(ent.end_char),
                       value=ent.text,
                       tag=ent.label_)
            for ent in doc.ents
            if ent.label_ in get_attr
        ]