Source code for etk.extractors.glossary_extractor

from typing import Iterator, List
from etk.extractor import Extractor, InputType
from etk.extraction import Extraction
from etk.tokenizer import Tokenizer
from etk.etk_exceptions import ExtractorError
from spacy.tokens import Token
from pygtrie import CharTrie
from itertools import chain, tee
from functools import reduce


class GlossaryExtractor(Extractor):
    """
    **Description**
        This class takes a list of glossary terms as reference and extracts the
        matching ngram strings from the tokenized input.

    Examples:
        ::

            glossary = ['Beijing', 'Los Angeles', 'New York', 'Shanghai']
            tokenizer = Tokenizer()
            glossary_extractor = GlossaryExtractor(glossary=glossary,
                                                   extractor_name='city_extractor',
                                                   tokenizer=tokenizer,
                                                   ngrams=3,
                                                   case_sensitive=True)
            glossary_extractor.extract(tokens=tokenizer.tokenize(input_text))

    """

    def __init__(self,
                 glossary: List[str],
                 extractor_name: str,
                 tokenizer: Tokenizer,
                 ngrams: int = 2,
                 case_sensitive: bool = False) -> None:
        Extractor.__init__(self,
                           input_type=InputType.TOKENS,
                           category="glossary",
                           name=extractor_name)
        self._case_sensitive = case_sensitive
        self._default_tokenizer = tokenizer
        if not ngrams:
            # Infer the ngram size from the longest glossary entry.
            ngrams = 0
            for word in glossary:
                ngrams = max(ngrams, len(self._default_tokenizer.tokenize(word)))
        self._ngrams = min(ngrams, 5)
        self._joiner = " "
        self._glossary = self._populate_trie(glossary)
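    # Note (editor's comment): matching works by joining each candidate
    # ngram with self._joiner and looking the joined string up as a key in
    # the CharTrie built by _populate_trie. Glossary keys are tokenized
    # with the same tokenizer as the input, so both sides are normalized
    # identically before comparison.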
    def extract(self, tokens: List[Token]) -> List[Extraction]:
        """
        Extracts information from a string (TEXT) with the GlossaryExtractor instance

        Args:
            tokens (List[Token]): list of spaCy tokens to be processed.

        Returns:
            List[Extraction]: the list of extractions, or the empty list if there are no matches.
        """
        results = list()

        if len(tokens) > 0:
            if self._case_sensitive:
                new_tokens = [x.orth_ for x in tokens]
            else:
                new_tokens = [x.lower_ for x in tokens]
        else:
            return results

        try:
            ngrams_iter = self._generate_ngrams_with_context(new_tokens)
            results.extend(map(lambda term: self._wrap_value_with_context(tokens, term[1], term[2]),
                               filter(lambda term: isinstance(term[0], str),
                                      map(lambda term: (self._glossary.get(term[0]), term[1], term[2]),
                                          map(lambda term: (self._combine_ngrams(term[0], self._joiner),
                                                            term[1], term[2]),
                                              ngrams_iter)))))
        except Exception as e:
            raise ExtractorError('GlossaryExtractor: Failed to extract with ' + self.name +
                                 '. Caught exception: ' + str(e)) from e
        return results
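    # Note (editor's comment): the extract() pipeline above is written in
    # map/filter style. Reading it inside out: it generates every 1..n-gram
    # together with its (start, end) token span, joins each ngram into a
    # trie key, keeps only the keys actually present in the glossary trie,
    # and wraps each hit in an Extraction carrying token and character
    # offsets. The helpers below supply those stages.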
    def _generate_ngrams_with_context(self, tokens: List[str]) -> chain:
        """Generates the 1-gram to n-gram tuples of the list of tokens"""
        chained_ngrams_iter = self._generate_ngrams_with_context_helper(iter(tokens), 1)
        for n in range(2, self._ngrams + 1):
            ngrams_iter = tee(tokens, n)
            for j in range(1, n):
                # Advance the j-th copy of the iterator by j positions so
                # that zipping the copies yields consecutive n-token windows.
                for k in range(j):
                    next(ngrams_iter[j], None)
            ngrams_iter_with_context = self._generate_ngrams_with_context_helper(zip(*ngrams_iter), n)
            chained_ngrams_iter = chain(chained_ngrams_iter, ngrams_iter_with_context)
        return chained_ngrams_iter

    def _populate_trie(self, values: List[str]) -> CharTrie:
        """Takes a list, inserts its elements into a new trie and returns it"""
        return reduce(self._populate_trie_reducer, iter(values), CharTrie())

    def _populate_trie_reducer(self, trie_accumulator: CharTrie, value: str) -> CharTrie:
        """Adds value to the trie accumulator"""
        if self._case_sensitive:
            key = self._joiner.join([x.orth_ for x in self._default_tokenizer.tokenize(value)])
        else:
            key = self._joiner.join([x.lower_ for x in self._default_tokenizer.tokenize(value)])
        trie_accumulator[key] = value
        return trie_accumulator

    def _wrap_value_with_context(self, tokens: List[Token], start: int, end: int) -> Extraction:
        """Wraps the final result"""
        return Extraction(' '.join([x.orth_ for x in tokens[start:end]]),
                          self.name,
                          start_token=start,
                          end_token=end,
                          start_char=tokens[start].idx,
                          end_char=tokens[end - 1].idx + len(tokens[end - 1].orth_))

    @staticmethod
    def _generate_ngrams_with_context_helper(ngrams_iter: Iterator, ngrams_len: int) -> map:
        """Attaches the start and end token indices to each ngram"""
        return map(lambda term: (term[1], term[0], term[0] + ngrams_len), enumerate(ngrams_iter))

    @staticmethod
    def _combine_ngrams(ngrams, joiner) -> str:
        """Constructs keys for lookup in the trie"""
        if isinstance(ngrams, str):
            return ngrams
        return joiner.join(ngrams)
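A minimal usage sketch follows, under stated assumptions: it presumes etk's Tokenizer can be constructed with default arguments and exposes a tokenize(text) method returning spaCy tokens (as the code above implies), and that Extraction exposes the matched string via a value attribute; the extractor name 'city_extractor' and the sample sentence are illustrative only, not part of the etk API.

if __name__ == '__main__':
    # Hypothetical end-to-end example; see the assumptions noted above.
    tokenizer = Tokenizer()
    glossary = ['Beijing', 'Los Angeles', 'New York', 'Shanghai']
    extractor = GlossaryExtractor(glossary=glossary,
                                  extractor_name='city_extractor',
                                  tokenizer=tokenizer,
                                  ngrams=3,
                                  case_sensitive=False)
    tokens = tokenizer.tokenize('Flights from Los Angeles to New York resumed.')
    for extraction in extractor.extract(tokens):
        # Each hit carries token and character offsets alongside the
        # matched surface string (assumed accessible as .value).
        print(extraction.value)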