# Source code for etk.extractors.inferlink_extractor

import json
from typing import List
from etk.extractor import Extractor, InputType
from etk.extraction import Extraction
from etk.etk_exceptions import ExtractorError
from etk.dependencies.landmark.landmark_extractor.extraction.Landmark import ItemRule, IterationRule, loadRule


class InferlinkRule(object):
    """
    Wrapper of a single rule.

    Holds the rule object built by ``loadRule`` together with the outcome of the most
    recent :meth:`apply` call (extracted value and its begin/end character offsets).
    All result properties are ``None`` until :meth:`apply` has been called.
    """

    def __init__(self, rule: dict):
        """
        Args:
            rule (dict): the rule definition; its optional ``'name'`` entry becomes the
                rule name (empty string when absent), and the whole dict is handed to ``loadRule``.
        """
        self._name = rule.get('name', '')
        self._value = None
        self._start_char = None
        self._end_char = None
        self._rule = loadRule(rule)

    @property
    def name(self) -> str:
        """str: the rule name, or ``''`` if the rule definition had none."""
        return self._name

    @property
    def value(self) -> str:
        """str: the value extracted by the last :meth:`apply`, or ``None`` if nothing was extracted."""
        return self._value

    @property
    def start_char(self) -> int:
        """int: begin offset reported by the last :meth:`apply`, or ``None`` if unavailable."""
        return self._start_char

    @property
    def end_char(self) -> int:
        """int: end offset reported by the last :meth:`apply`, or ``None`` if unavailable."""
        return self._end_char

    @staticmethod
    def _index_or_none(index):
        """Map a missing (``None``) or ``-1`` sentinel index to ``None``; pass any other index through."""
        return None if index is None or index == -1 else index

    def apply(self, html_text: str):
        """
        Run the wrapped rule on ``html_text`` and store the outcome on this object.

        Args:
            html_text (str): the HTML text to run the rule against.

        After the call, ``value`` is the extracted string (``None`` when the extract is
        missing or empty) and ``start_char`` / ``end_char`` are the reported offsets
        (``None`` when missing or equal to ``-1``).
        """
        extraction = self._rule.apply(html_text)  # a dict with 'rule_id', 'extract', 'begin_index', 'end_index'...
        extracted = extraction.get('extract')
        self._value = extracted if extracted != '' else None
        # Each index is validated against -1 on its own; the previous code compared the
        # 'extract' value to -1, which never filtered anything and could raise KeyError
        # when 'extract' was absent.
        self._start_char = self._index_or_none(extraction.get('begin_index'))
        self._end_char = self._index_or_none(extraction.get('end_index'))


class InferlinkRuleSet(object):
    """
    Convenience wrapper around the rule list of an inferlink JSON document.
    """

    def __init__(self, rule_set: List[dict]):
        """
        Args:
            rule_set (List[dict]): raw rule definitions, each wrapped in an ``InferlinkRule``.
        """
        wrapped_rules = []
        for raw_rule in rule_set:
            wrapped_rules.append(InferlinkRule(raw_rule))
        self._rules = wrapped_rules

    @property
    def rules(self) -> List[InferlinkRule]:
        """List[InferlinkRule]: the wrapped rules, in the order they were given."""
        return self._rules

    @staticmethod
    def load_rules_file(file_name: str) -> List[dict]:
        """
        Read a JSON file and return the content of its top-level ``'rules'`` entry.

        Args:
            file_name (str): path of the JSON file to read.

        Returns:
            List[dict]: the value stored under ``'rules'`` in the parsed document.
        """
        with open(file_name, 'r') as rules_file:
            document = json.load(rules_file)
        return document['rules']


class InferlinkExtractor(Extractor):
    """
    **Description**
        This class extracts segments from an HTML page using rules created by the Inferlink web wrapper.

    Examples:
        ::

            rule_set = InferlinkRuleSet(InferlinkRuleSet.load_rules_file('rules.json'))
            inferlink_extractor = InferlinkExtractor(rule_set)
            inferlink_extractor.extract(html_text=input_doc,
                                        threshold=0.8)
    """

    def __init__(self, rule_set: InferlinkRuleSet):
        """
        Args:
            rule_set (InferlinkRuleSet): the rules to apply on every call to :meth:`extract`.
        """
        Extractor.__init__(self,
                           input_type=InputType.HTML,
                           category="HTML extractor",
                           name="Inferlink extractor")
        self._rule_set = rule_set

    def extract(self, html_text: str, threshold=0.5) -> List[Extraction]:
        """

        Args:
            html_text (str): str of the html page to be extracted
            threshold (float): if the ratio of rules that successfully extracted something over all rules
                    is higher than or equal to the threshold, return the results, else return an empty list

        Returns:
            List[Extraction]: a list of Extractions, each extraction includes the extracted value, the rule name, the provenance etc.

        Raises:
            ExtractorError: if applying a rule or building an Extraction raises any exception;
                the original exception is attached as the cause.

        """

        result = list()
        try:
            rules = self._rule_set.rules
            for rule in rules:
                # apply() stores its outcome on the rule object; read it back right away.
                rule.apply(html_text)
                value = rule.value
                if value is not None:
                    # note the addition of a new tag argument to Extraction
                    start_char = rule.start_char
                    end_char = rule.end_char
                    result.append(Extraction(value, self.name, start_char=start_char, end_char=end_char, tag=rule.name))

            # Test whether the fraction of extractions meets the desired threshold
            # (an empty rule set never meets it, which also avoids dividing by zero).
            if len(rules) > 0 and float(len(result)) / len(rules) >= threshold:
                return result
            else:
                return list()
        except Exception as e:
            # Chain the original exception so its traceback is preserved for debugging.
            raise ExtractorError('Error in extracting landmark %s' % e) from e
# Source code for etk.extractors.inferlink_extractor
#
# ETK
#
# Navigation