Source code for etk.extractors.hostname_extractor
from etk.extractors.regex_extractor import RegexExtractor
class HostnameExtractor(RegexExtractor):
    """
    **Description**
        A RegexExtractor whose regular expression is fixed in advance to
        match hostnames.

        The pattern accepts one or more dot-terminated labels, each of which
        starts and ends with a letter or digit, may contain hyphens in
        between, and is at most 63 characters long. These are followed by a
        final label of 2 to 6 letters. A final label that begins with html,
        php, jsp, xml, pdf, asp, css, aspx or phtml is rejected, presumably
        so that file names such as ``index.html`` are not reported as
        hostnames.

    Examples:
        ::

            hostname_extractor = HostnameExtractor()
            hostname_extractor.extract(text=input_doc)
    """

    def __init__(self):
        # One or more "label." groups, anchored at a word boundary.
        dotted_labels = r"\b(?:[a-zA-Z0-9](?:[a-zA-Z0-9\-]{,61}[a-zA-Z0-9])?\.)+"
        # Negative lookahead: the final label must not start with one of
        # these file extensions.
        no_file_extension = r"(?!html|php|jsp|xml|pdf|asp|css|aspx|phtml)"
        # Final label: 2 to 6 letters, ending at a word boundary.
        final_label = r"[a-zA-Z]{2,6}\b"

        RegexExtractor.__init__(
            self,
            pattern=dotted_labels + no_file_extension + final_label,
            extractor_name="hostname extractor",
        )