Source code for etk.extractors.url_extractor

from etk.extractors.regex_extractor import RegexExtractor


# Non-ASCII code-point range accepted inside host/domain/TLD labels.
# Deliberately a normal (non-raw) literal: the two escapes are resolved by
# Python, so the final pattern contains the actual characters.
_URL_NON_ASCII = "\u00a1-\uffff"

# Character class for a single host/domain label character.
_URL_LABEL_CHAR = "[a-z" + _URL_NON_ASCII + "0-9]"

# Scheme prefix; the caller appends "?" to make it optional.
_URL_SCHEME = r"(?:(?:https?|ftp)://)"

# Everything that follows the scheme. Raw strings keep regex escapes such as
# \S, \d and \. away from Python's own string-escape processing.
# reference: https://gist.github.com/dperini/729294, slightly modified to match _
_URL_REMAINDER = "".join((
    # optional "user:password@" part
    r"(?:\S+(?::\S*)?@)?",
    r"(?:",
    # negative lookaheads rejecting 10.x.x.x, 127.x.x.x, 169.254.x.x,
    # 192.168.x.x and 172.16.x.x - 172.31.x.x
    r"(?!(?:10|127)(?:\.\d{1,3}){3})",
    r"(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})",
    r"(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})",
    # dotted IPv4 address: first octet 1-223, last octet 1-254
    r"(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])",
    r"(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}",
    r"(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))",
    r"|",
    # host name (single "_" or "-" allowed between label characters)
    r"(?:(?:", _URL_LABEL_CHAR, r"[_-]?)*", _URL_LABEL_CHAR, r"+)",
    # zero or more dot-separated domain labels
    r"(?:\.(?:", _URL_LABEL_CHAR, r"[_-]?)*", _URL_LABEL_CHAR, r"+)*",
    # top-level domain: at least two letters
    r"(?:\.(?:[a-z", _URL_NON_ASCII, r"]{2,}))",
    r")",
    # optional port number
    r"(?::\d{2,5})?",
    # optional resource path
    r"(?:/\S*)?",
))


class URLExtractor(RegexExtractor):
    """
    **Description**
        This class inherits the RegexExtractor and pre-defines the url pattern as the regex pattern.

    Example:
        ::

            url_extractor = URLExtractor(allow_missing_http=True)
            url_extractor.extractor(text=text)

    """

    def __init__(self, allow_missing_http: bool=False):
        """
        Build the URL pattern and hand it to RegexExtractor.

        Args:
            allow_missing_http (bool): when True the leading
                ``http://`` / ``https://`` / ``ftp://`` scheme is optional;
                when False (default) the scheme is required.
        """
        # The two variants differ only in whether the scheme group is optional.
        scheme = _URL_SCHEME + "?" if allow_missing_http else _URL_SCHEME
        url_pattern = scheme + _URL_REMAINDER
        super().__init__(pattern=url_pattern, extractor_name="url extractor")