# Source code for etk.extractors.dbpedia_spotlight_extractor
from SPARQLWrapper import SPARQLWrapper, JSON
from collections import OrderedDict
from etk.extractor import Extractor, InputType
from etk.extraction import Extraction
from typing import List
import requests
# NOTE(review): the original listing was a Sphinx "Source code for ..." export
# ("[docs]" anchor markers, flattened indentation); reconstructed as valid Python.
class DBpediaSpotlightExtractor(Extractor):
    """
    **Description**
        This extractor takes a string of text as input, uses DBPedia API to
        annotate words and phrases in the text input.

    Examples:
        ::

            dbpedia_spotlight_extractor = DBpediaSpotlightExtractor(search_url='http://model.dbpedia-spotlight.org/en/annotate',
                                                                    get_attr=False,
                                                                    get_attr_url="http://dbpedia.org/sparql")
            dbpedia_spotlight_extractor.extract(text=input_doc,
                                                filter=['Person', 'Place', 'Organisation'])
    """

    def __init__(self, extractor_name: str, search_url: str, get_attr=False,
                 get_attr_url="http://dbpedia.org/sparql") -> None:
        """
        Args:
            extractor_name (str): name registered on the Extractor base class.
            search_url (str): DBpedia Spotlight ``/annotate`` endpoint URL.
            get_attr (bool): when True, each annotation is enriched with the
                resource's attributes fetched over SPARQL (one extra remote
                query per annotated resource).
            get_attr_url (str): SPARQL endpoint used when ``get_attr`` is True.
        """
        Extractor.__init__(self,
                           input_type=InputType.TEXT,
                           category="built_in_extractor",
                           name=extractor_name)
        self._search_url = search_url
        self._get_attr = get_attr
        self._get_attr_url = get_attr_url

    def extract(self, text: str, confidence=0.5,
                filter=('Person', 'Place', 'Organisation')) -> List[Extraction]:
        """
        Annotate ``text`` with the DBpedia Spotlight service.

        Args:
            text (str): text input to be annotated.
            confidence (float): minimum annotation confidence forwarded to the
                service.
            filter: iterable of DBpedia type names the results are restricted
                to.  The parameter keeps its historical name ``filter`` (it is
                part of the public keyword interface) even though it shadows
                the builtin; the default is now an immutable tuple instead of
                a shared mutable list.

        Returns:
            List[Extraction]
        """
        payload = [('confidence', confidence),
                   ('text', text),
                   ('types', ','.join(filter))]
        headers = {'Accept': 'application/json'}
        # timeout keeps a hung annotation service from blocking forever;
        # raise_for_status surfaces HTTP errors instead of a confusing
        # JSON-decode failure on an HTML error page.
        response = requests.post(self._search_url,
                                 data=payload,
                                 headers=headers,
                                 timeout=60)
        response.raise_for_status()
        return self._combiner(response.json())

    def _combiner(self, results: dict) -> List[Extraction]:
        """
        Convert the raw Spotlight JSON response into Extraction objects.

        Returns an empty list when the response carries no "Resources" key.
        """
        extractions = list()
        for one_result in results.get("Resources", []):
            start = int(one_result['@offset'])
            surface_form = one_result['@surfaceForm']
            values = {'surface_form': surface_form,
                      'uri': one_result['@URI'],
                      'types': one_result['@types'].split(','),
                      'similarity_scores': float(one_result['@similarityScore'])}
            if self._get_attr:
                values['attributes'] = self._attr_finder(one_result['@URI'])
            # '@confidence' is the request-level threshold echoed back by the
            # service, so every extraction from one call shares it.
            extractions.append(
                Extraction(confidence=float(results['@confidence']),
                           extractor_name=self.name,
                           start_char=start,
                           end_char=start + len(surface_form),
                           value=values))
        return extractions

    def _attr_finder(self, uri) -> dict:
        """
        Fetch attributes of ``uri`` from the SPARQL endpoint.

        Language-tagged literals other than English are skipped.  At most 100
        distinct attribute keys are collected; once the cap is reached, values
        for *new* keys are silently dropped while values for already-seen keys
        are still appended (preserves the original capping behavior).
        """
        sparql = SPARQLWrapper(self._get_attr_url)
        # NOTE(review): query is built by string concatenation; ``uri`` comes
        # from the Spotlight response, but sanitize if it ever becomes
        # user-supplied.
        sparql.setQuery("SELECT distinct * WHERE {<" + uri + "> ?link ?resource}")
        sparql.setReturnFormat(JSON)
        results = sparql.query().convert()
        attr = OrderedDict()
        n_keys = 0
        for binding in results['results']['bindings']:
            resource = binding['resource']
            # keep untagged values (IRIs, plain literals) and English literals
            if resource.get('xml:lang', 'en') != 'en':
                continue
            key = binding['link']['value'].split('/')[-1]
            if key in attr:
                attr[key].append(resource['value'])
            elif n_keys < 100:
                attr[key] = [resource['value']]
                n_keys += 1
        return attr