Source code for etk.extractors.glossary_extractor
from typing import Iterator, List
from etk.extractor import Extractor, InputType
from etk.extraction import Extraction
from etk.tokenizer import Tokenizer
from etk.etk_exceptions import ExtractorError
from spacy.tokens import Token
from pygtrie import CharTrie
from itertools import chain, tee
from functools import reduce
class GlossaryExtractor(Extractor):
"""
**Description**
This class takes a list of glossary as reference, extract the matched ngrams string
from the tokenized input string.
Examples:
::
glossary = ['Beijing', 'Los Angeles', 'New York', 'Shanghai']
glossary_extractor = GlossaryExtractor(glossary=glossary,
ngrams=3,
case_sensitive=True)
glossary_extractor.extract(tokens=Tokenizer(input_text))
"""
def __init__(self,
glossary: List[str],
extractor_name: str,
tokenizer: Tokenizer,
ngrams: int = 2,
                 case_sensitive: bool = False) -> None:
Extractor.__init__(self,
input_type=InputType.TOKENS,
category="glossary",
name=extractor_name)
self._case_sensitive = case_sensitive
self._default_tokenizer = tokenizer
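        # When ngrams is 0/None, infer it from the longest glossary entry
        # (measured in tokens); the window size is always capped at 5 tokens.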
if not ngrams:
ngrams = 0
for word in glossary:
ngrams = max(ngrams, len(self._default_tokenizer.tokenize(word)))
self._ngrams = min(ngrams, 5)
self._joiner = " "
self._glossary = self._populate_trie(glossary)
    def extract(self, tokens: List[Token]) -> List[Extraction]:
"""
Extracts information from a string(TEXT) with the GlossaryExtractor instance
Args:
token (List[Token]): list of spaCy token to be processed.
Returns:
List[Extraction]: the list of extraction or the empty list if there are no matches.
"""
        results = []
        if not tokens:
            return results
        if self._case_sensitive:
            new_tokens = [x.orth_ for x in tokens]
        else:
            new_tokens = [x.lower_ for x in tokens]
        try:
            # For each candidate n-gram, join its tokens into a trie key; a hit
            # in the glossary trie becomes an Extraction spanning (start, end).
            for ngram, start, end in self._generate_ngrams_with_context(new_tokens):
                key = self._combine_ngrams(ngram, self._joiner)
                if isinstance(self._glossary.get(key), str):
                    results.append(self._wrap_value_with_context(tokens, start, end))
        except Exception as e:
            raise ExtractorError('GlossaryExtractor: failed to extract with '
                                 + self.name + ': ' + str(e)) from e
return results
    def _generate_ngrams_with_context(self, tokens: List[str]) -> Iterator:
        """Generates the 1-gram to n-gram tuples, each with its token span, from the list of tokens"""
chained_ngrams_iter = self._generate_ngrams_with_context_helper(iter(tokens), 1)
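        # For each n, tee() clones the token iterator n times; advancing the
        # j-th clone by j positions staggers the streams so that zip(*clones)
        # yields every contiguous n-gram as a sliding window.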
for n in range(2, self._ngrams + 1):
ngrams_iter = tee(tokens, n)
for j in range(1, n):
for k in range(j):
next(ngrams_iter[j], None)
ngrams_iter_with_context = self._generate_ngrams_with_context_helper(zip(*ngrams_iter), n)
chained_ngrams_iter = chain(chained_ngrams_iter, ngrams_iter_with_context)
return chained_ngrams_iter
def _populate_trie(self, values: List[str]) -> CharTrie:
"""Takes a list and inserts its elements into a new trie and returns it"""
return reduce(self._populate_trie_reducer, iter(values), CharTrie())
    def _populate_trie_reducer(self, trie_accumulator: CharTrie, value: str) -> CharTrie:
"""Adds value to trie accumulator"""
if self._case_sensitive:
key = self._joiner.join([x.orth_ for x in self._default_tokenizer.tokenize(value)])
else:
key = self._joiner.join([x.lower_ for x in self._default_tokenizer.tokenize(value)])
trie_accumulator[key] = value
return trie_accumulator
def _wrap_value_with_context(self, tokens: List[Token], start: int, end: int) -> Extraction:
"""Wraps the final result"""
return Extraction(' '.join([x.orth_ for x in tokens[start:end]]),
self.name,
start_token=start,
end_token=end,
start_char=tokens[start].idx,
end_char=tokens[end - 1].idx + len(tokens[end - 1].orth_)
)
@staticmethod
    def _generate_ngrams_with_context_helper(ngrams_iter: Iterator, ngrams_len: int) -> Iterator:
        """Pairs each n-gram with its token span, yielding (ngram, start, start + ngrams_len) tuples"""
        return map(lambda term: (term[1], term[0], term[0] + ngrams_len), enumerate(ngrams_iter))
@staticmethod
    def _combine_ngrams(ngrams, joiner) -> str:
        """Constructs a trie lookup key by joining the n-gram's tokens"""
        if isinstance(ngrams, str):
            return ngrams
        return joiner.join(ngrams)
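

# --- Usage sketch (illustrative only, kept as comments so the module stays
# importable). It assumes etk's default Tokenizer can be constructed with no
# arguments and that Extraction exposes a `value` attribute; the extractor
# name 'city_extractor' is hypothetical.
#
#     tokenizer = Tokenizer()
#     extractor = GlossaryExtractor(glossary=['Beijing', 'Los Angeles', 'New York'],
#                                   extractor_name='city_extractor',
#                                   tokenizer=tokenizer,
#                                   ngrams=2,
#                                   case_sensitive=False)
#     tokens = tokenizer.tokenize('He flew from New York to Beijing.')
#     for extraction in extractor.extract(tokens=tokens):
#         print(extraction.value)  # e.g. 'New York', 'Beijing'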