# ******************************************************************************
# Copyright 2017-2018 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ******************************************************************************
import traceback
import requests
from elasticsearch import Elasticsearch
from nlp_architect.data.cdc_resources.data_types.wiki.wikipedia_page import WikipediaPage
from nlp_architect.data.cdc_resources.data_types.wiki.wikipedia_page_extracted_relations import \
WikipediaPageExtractedRelations
from nlp_architect.data.cdc_resources.wikipedia.wiki_search_page_result import \
WikipediaSearchPageResult
[docs]class WikiElastic(object):
def __init__(self, host: str, port: int, index: str):
    """Connect to the Elasticsearch node that serves the Wikipedia index.

    Args:
        host: Elasticsearch host name or address.
        port: Elasticsearch HTTP port.
        index: name of the index the search methods query.

    Raises:
        IOError: if the connectivity probe (``is_connected``) reports failure.
    """
    # memo of get_pages() results, keyed by the phrase as passed in
    self.cache = dict()
    if not self.is_connected(host, port):
        # No exception is being handled at this point, so there is no
        # traceback worth printing; just report the failure to the caller.
        raise IOError('Cannot connect to ElasticSearch node')
    # connect to our cluster
    self.es_index = index
    self.es = Elasticsearch([{'host': host, 'port': port}])
@staticmethod
def is_connected(elastic_host, elastic_port, timeout=None):
    """Probe the Elasticsearch HTTP endpoint with a plain GET request.

    Args:
        elastic_host: host name or address of the node.
        elastic_port: HTTP port of the node.
        timeout: optional timeout (seconds) forwarded to ``requests.get``;
            ``None`` (the default) waits indefinitely, as before.

    Returns:
        True if the node answered with a non-empty body; False if the body
        was empty or the request itself failed.
    """
    elastic_search_url = 'http://' + elastic_host + ':' + str(elastic_port)
    try:
        res = requests.get(elastic_search_url, timeout=timeout)
    except requests.exceptions.RequestException:
        # An unreachable node is exactly the "not connected" case: keep the
        # diagnostic visible, but answer False instead of propagating.
        traceback.print_exc()
        return False
    return bool(res.content)
[docs] def get_pages(self, phrase):
if phrase in self.cache:
return self.cache[phrase]
try:
phrase_strip = ' '.join(phrase.replace('-', ' ').split())
pages = set()
best_results = self.get_best_elastic_results(phrase_strip)
for result in best_results:
_id = result['_id']
if _id != 0:
result_source = result['_source']
if 'redirectTitle' in result_source:
redirect_title = result_source['redirectTitle']
red_result = None
while redirect_title and result_source['title'] != redirect_title:
red_result = self.get_redirect_result(redirect_title)
if red_result is None or len(red_result) == 0:
print('could not find redirect title=' + redirect_title
+ ', does not exist in data')
redirect_title = None
elif 'redirectTitle' in red_result[0]['_source']:
redirect_title = red_result[0]['_source']['redirectTitle']
else:
redirect_title = None
if red_result is not None and len(red_result) > 0:
result = red_result[0]
_id = result['_id']
elastic_page_result = self.get_page_from_result_v1(phrase_strip, result, _id)
pages.add(WikipediaSearchPageResult(phrase, elastic_page_result))
self.cache[phrase] = pages
return pages
except Exception:
traceback.print_exc()
def get_best_elastic_results(self, phrase):
    """Collect candidate hits for ``phrase`` from two queries.

    The hits returned by ``get_redirect_result`` come first, followed by
    the hits of a ``match_phrase`` query (at most 5) on the
    ``title.near_match`` field.

    Args:
        phrase: the text to search for.

    Returns:
        A list with the hits of both queries, in that order.
    """
    near_match_query = {
        "size": 5,
        'query': {'match_phrase': {'title.near_match': phrase}},
    }
    combined_hits = list(self.get_redirect_result(phrase))
    near_match_response = self.es.search(index=self.es_index, body=near_match_query)
    combined_hits += self.extract_from_elastic_results(near_match_response)
    return combined_hits
def get_redirect_result(self, phrase):
    """Run a ``match_phrase`` query for ``phrase`` on ``title.keyword``.

    Args:
        phrase: the title text to search for.

    Returns:
        Whatever ``extract_from_elastic_results`` produces for the response
        (the query asks for at most 5 hits).
    """
    keyword_query = {
        "size": 5,
        'query': {'match_phrase': {'title.keyword': phrase}},
    }
    response = self.es.search(index=self.es_index, body=keyword_query)
    return self.extract_from_elastic_results(response)
def get_page_from_result_v1(self, phrase, result, result_id):
    """Convert one Elasticsearch hit into a ``WikipediaPage``.

    Args:
        phrase: the phrase the hit was found for.
        result: hit dict; its ``'_source'`` and ``'_score'`` entries are read.
        result_id: id of the hit; ``0`` is treated as "no page".

    Returns:
        A populated ``WikipediaPage``, or an empty ``WikipediaPage()`` when
        ``result_id`` is 0, ``result`` is None, or the hit has no source.
    """
    if result_id == 0 or result is None:
        return WikipediaPage()

    result_source = result['_source']
    result_score = result['_score']
    if result_source is None:
        # Without a source there is no title to build a page from; return
        # the empty page rather than reading an unbound ``title`` below.
        return WikipediaPage()

    title = result_source['title']
    relations = None
    relations_source = result_source['relations']
    if relations_source is not None:
        extract = self.safe_extract_field_from_dict
        is_part = relations_source['isPartName']
        is_disambig = relations_source['isDisambiguation']
        disambig_links = extract('disambiguationLinks', relations_source)
        disambig_links_norm = extract('disambiguationLinksNorm', relations_source)
        categories = extract('categories', relations_source)
        categories_norm = extract('categoriesNorm', relations_source)
        title_parent = extract('titleParenthesis', relations_source)
        title_parent_norm = extract('titleParenthesisNorm', relations_source)
        be_comp = extract('beCompRelations', relations_source)
        be_comp_norm = extract('beCompRelationsNorm', relations_source)
        # positional order must match the original call exactly
        relations = WikipediaPageExtractedRelations(
            is_part, is_disambig, title_parent, disambig_links,
            categories, None, be_comp, disambig_links_norm, categories_norm, None,
            title_parent_norm,
            be_comp_norm
        )
    return WikipediaPage(phrase, None, title, None, result_score, result_id, None,
                         relations)