Source code for nlp_architect.data.cdc_resources.data_types.wiki.wikipedia_page_extracted_relations
# ******************************************************************************
# Copyright 2017-2018 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ******************************************************************************
import re
import string
from typing import Set, Dict
from nlp_architect.utils.string_utils import StringUtils
# Category strings that, when present among a page's extracted categories,
# flag the page title as being part of a person's name.
PART_NAME_CATEGORIES = [
    'name',
    'given name',
    'surname',
]

# Marker stripped from category names during category extraction
# (presumably the suffix Wikipedia puts on disambiguation titles -- confirm).
DISAMBIGUATION_TITLE = '(disambiguation)'

# NOTE(review): not referenced in this section of the file; presumably the
# category names used elsewhere to detect disambiguation pages -- confirm.
DISAMBIGUATION_CATEGORY = [
    'disambig',
    'disambiguation',
]
class WikipediaPageExtractedRelations(object):
    """Container for the relations extracted from one Wikipedia page.

    Instances can be populated directly through the constructor or by
    parsing raw page text with :meth:`extract_relations_from_text_v0`, and
    serialized to a plain dictionary with :meth:`toJson`.
    """

    # Matches any single punctuation or whitespace character. The characters
    # are escaped so that every one of them (including the backslash) is
    # taken literally inside the character class.
    _SEPARATORS_RE = re.compile('[' + re.escape(string.punctuation + string.whitespace) + ']')

    # Lower-case section headers / tags that mark a "name" page. They are
    # handed to find_in_line(), which treats them as regular expressions.
    _NAME_PART_MARKERS = (
        '===as surname===',
        '===as given name===',
        '===given names===',
        '==as a surname==',
        '==people with the surname==',
        '==family name and surname==',
        'category:given names',
        '{{given name}}',
    )

    def __init__(self, is_part_name: bool = False, is_disambiguation: bool = False,
                 parenthesis: Set[str] = None,
                 disambiguation_links: Set[str] = None, categories: Set[str] = None,
                 aliases: Set[str] = None,
                 be_comp: Set[str] = None,
                 disambiguation_links_norm: Set[str] = None, categories_norm: Set[str] = None,
                 aliases_norm: Set[str] = None,
                 title_parenthesis_norm: Set[str] = None, be_comp_norm: Set[str] = None) -> None:
        """
        Object representing a Wikipedia Relations Schema

        Args:
            is_part_name (bool): Whether page title is part of a name
                (i.e. family name / given name ...)
            is_disambiguation (bool): Whether page is a disambiguation page
            parenthesis (set): a set of all parenthesis links/titles
                (stored on the instance as ``title_parenthesis``)
            disambiguation_links (set): a set of all disambiguation links/titles
            categories (set): a set of all category links/titles
            aliases (set): a set of all aliases links/titles
            be_comp (set): a set of all "is a" links/titles
            disambiguation_links_norm (set): same as disambiguation_links, normalized
            categories_norm (set): same as categories, normalized (lower and clean)
            aliases_norm (set): same as aliases, normalized (lower and clean)
            title_parenthesis_norm (set): same as parenthesis, normalized (lower and clean)
            be_comp_norm (set): same as be_comp, normalized (lower and clean)
        """
        self.is_part_name = is_part_name
        self.is_disambiguation = is_disambiguation
        self.disambiguation_links = disambiguation_links
        self.title_parenthesis = parenthesis
        self.categories = categories
        self.aliases = aliases
        self.be_comp = be_comp

        self.disambiguation_links_norm = disambiguation_links_norm
        self.categories_norm = categories_norm
        self.aliases_norm = aliases_norm
        self.title_parenthesis_norm = title_parenthesis_norm
        self.be_comp_norm = be_comp_norm

    def extract_relations_from_text_v0(self, text):
        """
        Populate this object from raw page text, one line at a time.

        Resets the disambiguation, category, title-parenthesis and
        be-comp-norm fields, then scans every line of ``text`` for
        categories, name-part markers and ``[[...]]`` links. ``aliases`` and
        ``be_comp`` are left untouched.

        Args:
            text (str): page text; lines are separated by ``'\\n'``
        """
        self.disambiguation_links = set()
        self.categories = set()
        self.title_parenthesis = set()

        self.disambiguation_links_norm = set()
        self.categories_norm = set()
        self.title_parenthesis_norm = set()
        self.be_comp_norm = set()

        ext_links = set()
        title_parenthesis = set()

        for line in text.split('\n'):
            cat_links = self.extract_categories(line)

            # Once a page is flagged as a name part the flag is never cleared.
            if not self.is_part_name:
                self.is_part_name = self.is_name_part(line)
            # NOTE(review): this is an exact membership test of each
            # PART_NAME_CATEGORIES entry against the extracted category
            # strings (case-sensitive, no substring match) -- confirm intended.
            if not self.is_part_name and any(s in cat_links for s in PART_NAME_CATEGORIES):
                self.is_part_name = True

            self.categories.update(cat_links)
            self.categories_norm.update(StringUtils.normalize_string_list(cat_links))

            links, parenthesis_links = self.extract_links_and_parenthesis(line)
            ext_links.update(links)
            title_parenthesis.update(parenthesis_links)

        # Links are only recorded as disambiguation links when the page was
        # already flagged as a disambiguation page.
        if self.is_disambiguation:
            self.disambiguation_links = ext_links
            self.disambiguation_links_norm = StringUtils.normalize_string_list(ext_links)

        self.title_parenthesis = title_parenthesis
        self.title_parenthesis_norm = StringUtils.normalize_string_list(title_parenthesis)

    def __str__(self) -> str:
        """Return the main fields as a comma-separated string."""
        fields = (self.is_disambiguation, self.is_part_name, self.disambiguation_links,
                  self.be_comp, self.title_parenthesis, self.categories)
        return ', '.join(str(field) for field in fields)

    @staticmethod
    def _to_list(values):
        """Return ``values`` as a list, mapping ``None`` to an empty list."""
        return [] if values is None else list(values)

    def toJson(self) -> Dict:
        """
        Serialize the relations into a dictionary of plain lists and booleans.

        ``isPartName`` and ``isDisambiguation`` are always present. Each
        group of relation keys is emitted only when its raw set is not None;
        a missing normalized counterpart is emitted as an empty list. The
        be-comp keys are emitted when either ``be_comp`` or ``be_comp_norm``
        is set, so an object filled by ``extract_relations_from_text_v0``
        (which sets only ``be_comp_norm``) serializes without raising.

        Returns:
            Dict: the serialized relations
        """
        to_list = WikipediaPageExtractedRelations._to_list
        result_dict = {
            'isPartName': self.is_part_name,
            'isDisambiguation': self.is_disambiguation,
        }

        if self.disambiguation_links is not None:
            result_dict['disambiguationLinks'] = list(self.disambiguation_links)
            result_dict['disambiguationLinksNorm'] = to_list(self.disambiguation_links_norm)
        if self.categories is not None:
            result_dict['categories'] = list(self.categories)
            result_dict['categoriesNorm'] = to_list(self.categories_norm)
        if self.aliases is not None:
            result_dict['aliases'] = list(self.aliases)
        if self.title_parenthesis is not None:
            result_dict['titleParenthesis'] = list(self.title_parenthesis)
            result_dict['titleParenthesisNorm'] = to_list(self.title_parenthesis_norm)
        if self.be_comp is not None or self.be_comp_norm is not None:
            result_dict['beCompRelations'] = to_list(self.be_comp)
            result_dict['beCompRelationsNorm'] = to_list(self.be_comp_norm)

        return result_dict

    @staticmethod
    def extract_categories(line: str) -> Set[str]:
        """
        Extract category names from a single line of page text.

        Two forms are recognized: ``[[Category:<name>]]`` links (with any
        ``(disambiguation)`` marker removed from the name) and a line that
        consists solely of a ``{{disambig...}}`` / ``{{Disambig...}}``
        template, whose ``|``-separated parts are each added as a category.

        Args:
            line (str): one line of page text

        Returns:
            Set[str]: the categories found (possibly empty)
        """
        categories = set()

        # Non-greedy so that several category links on one line are
        # captured separately instead of as one fused string.
        for cat in re.findall(r'\[\[Category:(.*?)\]\]', line):
            categories.add(cat.replace(DISAMBIGUATION_TITLE, ''))

        template = re.search(r'^{{(disambig.*|Disambig.*)}}$', line)
        if template is not None:
            categories.update(template.group(1).split('|'))

        return categories

    @staticmethod
    def extract_links_and_parenthesis(line: str):
        """
        Extract ``[[...]]`` link texts and their parenthesized qualifiers.

        Every link is split on ``|`` and each part is handled on its own.
        A trailing ``(...)`` is separated from the part; unless it is
        exactly "disambiguation" (case-insensitive) its content is recorded
        as a parenthesis link. Punctuation and whitespace characters in both
        results are replaced by spaces and the ends are stripped.

        Args:
            line (str): one line of page text

        Returns:
            tuple: ``(links, parenthesis_links)``, both sets of str
        """
        links = set()
        parenthesis_links = set()
        separators = WikipediaPageExtractedRelations._SEPARATORS_RE

        # Non-greedy: "[[A]] and [[B]]" yields "A" and "B", not one link
        # spanning everything between the first "[[" and the last "]]".
        for link in re.findall(r'\[\[(.*?)\]\]', line):
            for s_link in link.split('|'):
                parenthesis_clean = None
                matcher = re.match(r'(.*)\s?\((.*)\)', s_link)
                if matcher:
                    s_link = matcher.group(1)
                    parenthesis_match = matcher.group(2)
                    if parenthesis_match.lower() != 'disambiguation':
                        parenthesis_clean = separators.sub(' ', parenthesis_match).strip()

                # The cleaned text can no longer contain "(" or ")", so no
                # further check against the "(disambiguation)" marker is needed.
                if parenthesis_clean is not None:
                    parenthesis_links.add(parenthesis_clean)

                links.add(separators.sub(' ', s_link).strip())

        return links, parenthesis_links

    @staticmethod
    def is_name_part(line: str) -> bool:
        """
        Tell whether a line carries one of the known "name" page markers.

        The line is lower-cased and searched for surname / given-name
        section headers, the ``category:given names`` link text and the
        ``{{given name}}`` template.

        Args:
            line (str): one line of page text

        Returns:
            bool: True if any marker occurs in the line
        """
        line = line.lower()
        return any(WikipediaPageExtractedRelations.find_in_line(line, marker)
                   for marker in WikipediaPageExtractedRelations._NAME_PART_MARKERS)

    @staticmethod
    def find_in_line(text: str, pattern: str) -> bool:
        """
        Tell whether ``pattern`` occurs anywhere in ``text``.

        Args:
            text (str): the text to search
            pattern (str): a regular expression (not a plain substring)

        Returns:
            bool: True if the pattern matches at least once
        """
        return re.search(pattern, text) is not None