"""Sentence extractor for etk: splits text into sentences using spaCy's dependency parser."""

from typing import List
from etk.extraction import Extraction
from etk.extractor import Extractor, InputType

import copy
import spacy


class SentenceExtractor(Extractor):
    """
    **Description**
        Extract individual sentences using lightweight spaCy module.

    Examples:
        ::

            sentence_extractor = SentenceExtractor(custom_nlp=nlp)
            sentence_extractor.extract(text=text)
    """

    def __init__(self, name: str = None, custom_nlp: type = None) -> None:
        """
        Args:
            name (str): optional display name; defaults to "Sentence extractor".
            custom_nlp: an existing spaCy Language pipeline to reuse. Only its
                "parser" component is kept (it supplies sentence boundaries);
                if the pipeline is unusable or lacks a parser, en_core_web_sm
                is loaded instead.
        """
        Extractor.__init__(self,
                           input_type=InputType.TEXT,
                           category="Text extractor",
                           name=name if name else "Sentence extractor")
        load_parser = False
        if custom_nlp:
            try:
                # Deep-copy so stripping components does not mutate the
                # caller's pipeline.
                custom_pipeline = copy.deepcopy(custom_nlp)
                # Snapshot the names: remove_pipe mutates the pipeline while
                # we iterate.
                for pipe in list(custom_pipeline.pipe_names):
                    if pipe != "parser":
                        custom_pipeline.remove_pipe(pipe)
                # Explicit check instead of assert/except AssertionError:
                # asserts are stripped under `python -O`, which would have
                # silently skipped this fallback.
                if "parser" in custom_pipeline.pipe_names:
                    self._parser = custom_pipeline
                else:
                    print("Note: custom_pipeline does not have a parser. \n"
                          "Loading parser from en_core_web_sm... ")
                    load_parser = True
            except AttributeError as e:
                # custom_nlp is not a spaCy pipeline (no pipe_names, etc.)
                print("Note: custom_pipeline does not have expected "
                      "attributes.")
                print(e)
                print("Loading parser from en_core_web_sm...")
                load_parser = True
        else:
            load_parser = True
        if load_parser:
            # Parser alone is enough for sentence segmentation; drop the
            # tagger and NER for speed.
            self._parser = spacy.load("en_core_web_sm",
                                      disable=["tagger", "ner"])

    def extract(self, text: str) -> List[Extraction]:
        """
        Splits text by sentences.

        Args:
            text (str): Input text to be extracted.

        Returns:
            List[Extraction]: the list of extraction or the empty list if
                there are no matches.
        """
        doc = self._parser(text)
        extractions = list()
        for sent in doc.sents:
            # Fix: start_char/end_char must be character OFFSETS of the
            # sentence within the document. The previous code passed
            # sent.text[0] / sent.text[-1] — the literal first and last
            # characters of the sentence — which is not an offset at all.
            # spaCy Span exposes the offsets directly as
            # sent.start_char / sent.end_char.
            this_extraction = Extraction(value=sent.text,
                                         extractor_name=self.name,
                                         # NOTE(review): start/end_token keep
                                         # the original Token objects — confirm
                                         # Extraction expects Tokens rather
                                         # than token indices.
                                         start_token=sent[0],
                                         end_token=sent[-1],
                                         start_char=sent.start_char,
                                         end_char=sent.end_char)
            extractions.append(this_extraction)
        return extractions