# Source code for etk.extractors.html_content_extractor

from typing import List
from enum import Enum, auto
from bs4 import BeautifulSoup
from bs4.element import Comment
from etk.extractor import Extractor, InputType
from etk.extraction import Extraction
from etk.extractors.readability.readability import Document


class Strategy(Enum):
    """
    Text-extraction strategies for an HTML page.

    ALL_TEXT: return all visible text in an HTML page
    MAIN_CONTENT_STRICT: return the main content of the page without boilerplate (menus, ads...)
    MAIN_CONTENT_RELAXED: variant of MAIN_CONTENT_STRICT with less strict rules
    """
    # Member values are assigned by auto(); compare members by identity/equality, not by value.
    ALL_TEXT = auto()
    MAIN_CONTENT_STRICT = auto()
    MAIN_CONTENT_RELAXED = auto()


class HTMLContentExtractor(Extractor):
    """
    **Description**
        This class extracts text from HTML pages. Uses readability and BeautifulSoup.

    Examples:
        ::

            html_content_extractor = HTMLContentExtractor()
            html_content_extractor.extract(html_text=input_doc, strategy=Strategy.ALL_TEXT)

    """

    # Text nodes whose parent tag has one of these names are treated as not visible.
    # Kept as a class-level frozenset so it is built once instead of once per text node.
    _INVISIBLE_PARENTS = frozenset(('style', 'script', 'head', 'title', 'meta', '[document]'))

    def __init__(self):
        super().__init__(input_type=InputType.HTML,
                         category="HTML extractor",
                         name="HTML content extractor")

    def extract(self, html_text: str, strategy: Strategy = Strategy.ALL_TEXT) -> List[Extraction]:
        """
        Extracts text from an HTML page using a variety of strategies

        Args:
            html_text (str): html page in string
            strategy (enum[Strategy.ALL_TEXT, Strategy.MAIN_CONTENT_RELAXED, Strategy.MAIN_CONTENT_STRICT]):
                one of Strategy.ALL_TEXT, Strategy.MAIN_CONTENT_STRICT and Strategy.MAIN_CONTENT_RELAXED.
                Any value other than Strategy.ALL_TEXT takes the readability path; only
                Strategy.MAIN_CONTENT_RELAXED sets ``recallPriority=True`` on the readability Document.

        Returns:
            List[Extraction]: an empty list when ``html_text`` is empty or None, otherwise a
            singleton list with the extracted text

        """
        if not html_text:
            return []

        if strategy == Strategy.ALL_TEXT:
            soup = BeautifulSoup(html_text, 'html.parser')
            # find_all(string=True) is the current spelling of the deprecated findAll(text=True):
            # it returns every text node in the document, comments included.
            text_nodes = soup.find_all(string=True)
            visible_texts = (node for node in text_nodes if self._tag_visible(node))
            # Each node is stripped individually, so whitespace-only nodes contribute empty
            # strings and the joined result can contain runs of several spaces.
            all_text = " ".join(node.strip() for node in visible_texts)
            return [Extraction(all_text, self.name)]

        relax = strategy == Strategy.MAIN_CONTENT_RELAXED
        readable = Document(html_text, recallPriority=relax).summary(html_partial=False)
        # The readability summary is re-parsed from UTF-8 bytes with lxml and all of its
        # text strings are joined with single spaces.
        clean_text = BeautifulSoup(readable.encode('utf-8'), 'lxml').strings
        readability_text = ' '.join(clean_text)
        return [Extraction(readability_text, self.name)]

    @staticmethod
    def _tag_visible(element):
        """
        Tell whether a text node should be kept in the ALL_TEXT output.

        Args:
            element: a text node returned by BeautifulSoup; its ``parent.name`` is inspected

        Returns:
            bool: False when the node's parent tag is one of ``_INVISIBLE_PARENTS`` or the node
            is an HTML comment, True otherwise

        """
        if element.parent.name in HTMLContentExtractor._INVISIBLE_PARENTS:
            return False
        return not isinstance(element, Comment)