Source code for etk.extractors.inferlink_extractor

import json
from typing import List
from etk.extractor import Extractor, InputType
from etk.extraction import Extraction
from etk.etk_exceptions import ExtractorError
from etk.dependencies.landmark.landmark_extractor.extraction.Landmark import ItemRule, IterationRule, loadRule


class InferlinkRule(object):
    """
    Wrapper of a single rule.
    """

    def __init__(self, rule: dict):
        self._name = rule['name'] if 'name' in rule else ''
        self._value = None
        self._start_char = None
        self._end_char = None
        self._rule = loadRule(rule)

    @property
    def name(self) -> str:
        return self._name

    @property
    def value(self) -> str:
        return self._value

    @property
    def start_char(self) -> int:
        return self._start_char

    @property
    def end_char(self) -> int:
        return self._end_char

    def apply(self, html_text: str):
        extraction = self._rule.apply(html_text)  # a dict with 'rule_id', 'extract', 'begin_index', 'end_index'...
        self._value = extraction['extract'] if 'extract' in extraction and extraction['extract'] != '' else None
        self._start_char = extraction['begin_index'] if 'begin_index' in extraction and extraction[
            'extract'] != -1 else None
        self._end_char = extraction['end_index'] if 'end_index' in extraction and extraction['extract'] != -1 else None


class InferlinkRuleSet(object):
    """
    Wrapper class on an inferlink JSON to provide a convenient API to work with rules.
    """

    def __init__(self, rule_set: List[dict]):
        self._rules = [InferlinkRule(rule) for rule in rule_set]

    @property
    def rules(self) -> List[InferlinkRule]:
        return self._rules

    @staticmethod
    def load_rules_file(file_name: str) -> List[dict]:
        with open(file_name, 'r') as f:
            return json.load(f)['rules']


[docs]class InferlinkExtractor(Extractor): """ **Description** This class extracts segments from an HTML page using rules created by the Inferlink web wrapper. Examples: :: inferlink_extractor = InferlinkExtractor() inferlink_extractor.extract(text=input_doc, threshold=0.8) """ def __init__(self, rule_set: InferlinkRuleSet): Extractor.__init__(self, input_type=InputType.HTML, category="HTML extractor", name="Inferlink extractor") self._rule_set = rule_set
[docs] def extract(self, html_text: str, threshold=0.5) -> List[Extraction]: """ Args: html_text (str): str of the html page to be extracted threshold (float): if the ratio of rules that successfully extracted something over all rules \ is higher than or equal to the threshold, return the results, else return an empty list Returns: List[Extraction]: a list of Extractions, each extraction includes the extracted value, the rule name, the provenance etc. """ result = list() try: for rule in self._rule_set.rules: rule.apply(html_text) value = rule.value if value is not None: # note the addition of a new tag argument to Extraction start_char = rule.start_char end_char = rule.end_char result.append(Extraction(value, self.name, start_char=start_char, end_char=end_char, tag=rule.name)) # Test whether the fraction of extractions meets the desired threshold if len(self._rule_set.rules) > 0 and float(len(result)) / len(self._rule_set.rules) >= threshold: return result else: return list() except Exception as e: raise ExtractorError('Error in extracting landmark %s' % e)