Source code for nlp_architect.data.cdc_resources.wikipedia.wiki_online
# ******************************************************************************
# Copyright 2017-2018 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ******************************************************************************
import os
import logging
import re
from nlp_architect.data.cdc_resources.data_types.wiki.wikipedia_page import WikipediaPage
from nlp_architect.data.cdc_resources.data_types.wiki.wikipedia_page_extracted_relations import \
    WikipediaPageExtractedRelations
from nlp_architect.data.cdc_resources.wikipedia.wiki_search_page_result import \
    WikipediaSearchPageResult
from nlp_architect.utils.text import SpacyInstance

os.environ['PYWIKIBOT_NO_USER_CONFIG'] = '1'

DISAMBIGUATE_PAGE = ['wikimedia disambiguation page', 'wikipedia disambiguation page']
NAME_DESCRIPTIONS = ['given name', 'first name', 'family name']

logger = logging.getLogger(__name__)
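

# WikiOnline queries Wikipedia/Wikidata live through pywikibot and, for each phrase
# variant that resolves to a page, builds a WikipediaPage carrying the extracted
# relations used by the cross-document coreference (CDC) resources.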
class WikiOnline(object):
    def __init__(self):
        import pywikibot
        self.spacy = SpacyInstance()
        self.pywikibot = pywikibot
        self.cache = dict()
        self.site = pywikibot.Site('en', 'wikipedia')  # The site we want to run our bot on

    def get_pages(self, phrase):
        if phrase in self.cache:
            return self.cache[phrase]

        ret_pages = set()
        word_clean = phrase.replace('-', ' ')
        word_lower = word_clean.lower()
        word_upper = word_clean.upper()
        word_title = word_clean.title()
        words_set = {phrase, word_clean, word_lower, word_upper, word_title}
        for appr in words_set:
            try:
                page_result = self.get_page_redirect(appr)
                if page_result.pageid != 0:
                    full_page = self.get_wiki_page_with_items(phrase, page_result)
                    ret_pages.add(WikipediaSearchPageResult(appr, full_page))
            except Exception as e:
                logger.error(e)

        self.cache[phrase] = ret_pages
        return ret_pages

    # pylint: disable=protected-access
    def get_wiki_page_with_items(self, phrase, page):
        item = self.get_wiki_page_item(page)
        pageid = page.pageid
        aliases = self.get_aliases(item)
        description = self.get_description(item)
        text = page.text
        page_title = page._link._title

        relations = WikipediaPageExtractedRelations()
        relations.is_disambiguation = self.is_disambiguation_page(item)
        relations.is_part_name = self.is_name_description(text, item, relations.is_disambiguation)
        relations.aliases = aliases
        relations.be_comp, relations.be_comp_norm = self.extract_be_comp(text)
        relations.extract_relations_from_text_v0(text)

        ret_page = WikipediaPage(phrase, None, page_title, None, 0, pageid, description, relations)
        logger.debug('Page: {}. Extracted successfully'.format(ret_page))
        return ret_page

    def get_wiki_page_item(self, page):
        if page is not None:
            try:
                item = self.pywikibot.ItemPage.fromPage(
                    page)  # this can be used for any page object
                item.get()  # need to call it to access any data.
                return item
            except (self.pywikibot.NoPage, AttributeError, TypeError, NameError):
                pass
        return None

    def get_page_redirect(self, word):
        page = self.pywikibot.Page(self.site, word)
        if page.pageid != 0 and page.isRedirectPage():
            return page.getRedirectTarget()
        return page

    @staticmethod
    def get_aliases(item):
        if item is not None and item.aliases is not None:
            if 'en' in item.aliases:
                aliases = item.aliases['en']
                return aliases
        return None

    @staticmethod
    def get_description(item):
        description = {}
        if item is not None:
            item_desc = item.get()
            if 'descriptions' in item_desc and 'en' in item_desc['descriptions']:
                description['descriptions'] = {'en': item_desc['descriptions']['en']}
        return description

    @staticmethod
    def is_disambiguation_page(item):
        if item is not None:
            dic = item.get()
            if dic is not None and 'descriptions' in dic:
                desc = dic['descriptions']
                if desc is not None and 'en' in desc:
                    return desc['en'].lower() in DISAMBIGUATE_PAGE
        return False

    @staticmethod
    def is_name_description(text, item, is_disambiguation):
        if item is not None:
            if is_disambiguation:
                if WikipediaPageExtractedRelations.is_name_part(text):
                    return True
            else:
                dic = item.get()
                if dic is not None and 'descriptions' in dic:
                    desc = dic['descriptions']
                    if desc is not None and 'en' in desc:
                        if any(s in desc['en'].lower() for s in NAME_DESCRIPTIONS):
                            return True
        return False

    # pylint: disable=no-else-return
    def extract_be_comp(self, text):
        # the introduction sentence starts at the first bolded ('''...''') span
        first_sentence_start_index = text.find("'''")
        if first_sentence_start_index == -1:
            return None, None
        last_temp_index = text.find('\n', first_sentence_start_index)
        if last_temp_index == -1:
            last_temp_index = len(text)

        first_paragraph = text[first_sentence_start_index:last_temp_index]
        be_a_index = WikiOnline.extract_be_a_index(first_paragraph)
        if be_a_index is None and last_temp_index != len(text):
            # no 'be'-phrase in this paragraph, continue with the rest of the text
            return self.extract_be_comp(text[last_temp_index:])
        elif be_a_index is None:
            return None, None

        # strip wiki markup (parentheses, html tags, templates, links) before parsing
        first_paragraph_clean = re.sub(r'\([^)]*\)', '', first_paragraph)
        first_paragraph_clean = re.sub(r'<[^>]*>', '', first_paragraph_clean)
        first_paragraph_clean = re.sub(r'{[^}]*}', '', first_paragraph_clean)
        first_paragraph_clean = re.sub(r'\[\[[^]]*\]\]', '', first_paragraph_clean)
        first_paragraph_clean = re.sub(r'[\']', '', first_paragraph_clean)
        first_paragraph_clean = re.sub(r'\xa0', ' ', first_paragraph_clean)  # normalize non-breaking spaces

        return self.extract_be_comp_relations(first_paragraph_clean)

    # pylint: disable=not-callable
    def extract_be_comp_relations(self, first_paragraph):
        be_comp = set()
        be_comp_norm = set()
        if first_paragraph:
            doc = self.spacy.parser(first_paragraph)
            for token in doc:
                target = token.text
                target_lemma = token.lemma_
                relation = token.dep_
                governor = token.head.text
                governor_lemma = token.head.lemma_
                if relation == 'acl':
                    break
                if relation == 'punct' and target == '.':
                    break
                elif relation == 'cop':
                    be_comp.add(governor)
                    be_comp_norm.add(governor_lemma)
                elif relation == 'nsubj':
                    be_comp.add(target)
                    be_comp_norm.add(target_lemma)
                elif relation == 'dep':
                    be_comp.add(governor)
                    be_comp_norm.add(governor_lemma)
                elif relation in ['compound', 'amod']:
                    be_comp.add(target + ' ' + governor)
                    be_comp_norm.add(target_lemma + ' ' + governor_lemma)
                elif relation in ['conj', 'appos']:
                    be_comp.add(target)
                    be_comp_norm.add(target_lemma)

        return be_comp, be_comp_norm

    @staticmethod
    def extract_be_a_index(sentence):
        # return the index of the first 'be'-phrase found (checked in this priority order),
        # or None when the sentence contains no such phrase
        for phrase in ('is a', 'are a', 'was a', 'were a', 'be a',
                       'is the', 'are the', 'was the', 'were the', 'be the'):
            if phrase in sentence:
                return sentence.index(phrase)
        return None
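

# Minimal usage sketch (illustrative; assumes pywikibot is installed and the machine has
# network access to wikipedia.org):
if __name__ == '__main__':
    wiki = WikiOnline()
    results = wiki.get_pages('Tesla')  # a set of WikipediaSearchPageResult objects
    for result in results:
        print(result)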