Source code for etk.extractors.decoding_value_extractor
from typing import List
from etk.extractor import Extractor, InputType
from etk.extraction import Extraction
from etk.etk_exceptions import ExtractorError
[docs]class DecodingValueExtractor(Extractor):
"""
**Description**
This class takes a 'decoding_dict' as reference, decoding the input text based on the 'decoding_dict'
Examples:
::
decoding_dict = {
'CA': 'California',
'ny': 'New York',
'AZ': ' Arizona',
' TX ': 'Texas',
' fl': 'Florida',
}
decoding_value_extractor = DecodingValueExtractor(decoding_dict=decoding_dict,
extractor_name='default_decoding',
case_sensitive=True,
strip_key=False,
strip_value=True)
decoding_value_extractor.extract(value=value_to_be_decoded)
"""
def __init__(self,
decoding_dict: dict,
extractor_name: str,
default_action: str='delete',
case_sensitive: bool=False,
strip_key: bool=True,
strip_value: bool=False,
) -> None:
"""
Args:
decoding_dict: dict -> a python dictionary for decoding values
extractor_name: str -> extractor name
default_action: enum['delete'] -> what if the value not matched in dictionary
case_sensitive: bool -> matching the key and value strictly or ignore cases
strip_key: bool -> strip key and value for matching or not
strip_value: bool -> return the striped value if matched or the original value
"""
Extractor.__init__(self,
input_type=InputType.TEXT,
category="dictionary",
name=extractor_name)
if case_sensitive and not strip_key:
self._decoding_dict = decoding_dict
else:
new_dict = {}
if not strip_key: # not case_sensitive, ignore cases
for k in decoding_dict:
new_dict[k.lower()] = decoding_dict[k]
elif case_sensitive: # strip key
for k in decoding_dict:
new_dict[k.strip()] = decoding_dict[k]
else: # ignore case AND strip key
for k in decoding_dict:
new_dict[k.lower().strip()] = decoding_dict[k]
self._decoding_dict = new_dict
self._case_sensitive = case_sensitive
self._default_action = default_action
self._strip_key = strip_key
self._strip_value = strip_value
self._joiner = " "
[docs] def extract(self, value: str) -> List[Extraction]:
"""
Args:
value (str): the value to be decode
Returns:
List[Extraction]: actually a single Extraction wrapped in a list if there is a match
"""
to_match = value.lower() if not self._case_sensitive else value
to_match = to_match.strip() if self._strip_key else to_match
if to_match in self._decoding_dict:
extraction = self._wrap_result(self._decoding_dict[to_match], value)
return [extraction] if extraction else list()
else:
if self._default_action == 'delete':
return list()
return list()
def _wrap_result(self, value: str, original_key: str) -> Extraction or None:
"""
Args:
value: the decoded value
original_key: the original string value to be decode
Returns: an Extraction if everything goes well
"""
try:
value = value.strip() if self._strip_value else value
e = Extraction(value, self.name, start_char=0, end_char=len(str(value)))
return e
except Exception as e:
print('fail to wrap dictionary extraction: ', original_key, value)
raise ExtractorError('Exception: ' + str(e))