# -*- coding: utf-8 -*-
"""
Preprocess the data
"""
#######################################
import re
import logging
import io
import spacy
import gensim
from sptm.utils import force_unicode
__author__ = "Rochan Avlur Venkat"
__credits__ = ["Anupam Mediratta"]
__license__ = "MIT"
__version__ = "1.0"
__maintainer__ = "Rochan Avlur Venkat"
__email__ = "rochan170543@mechyd.ac.in"
#######################################
# Setup logging for gensim
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)
# Setup spacy model for custom preprocessing of data used in tokenize_custom()
NLP = spacy.load('en_core_web_sm')
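# NOTE: the model must be installed before this load succeeds, e.g. via:
#   python -m spacy download en_core_web_sm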
class Corpus:
    """Corpus object to handle all pre-processing of data.

    read_reviews() assumes each line of the data file to be in the
    following format, with fields separated by tabs:

        <metadata><TAB>...<TAB><data_in_multiple_sentences>

    The data to be preprocessed must be in the last column.

    Attributes:
        path: Path to the data file.
        raw_review: Data read from the file, as a list.
        sentences: List of [data index, sentence] pairs.
        tokens: List of lists, each holding a data index followed by a
            tokenized sentence.
    """

    def __init__(self, path=None, raw_review=None, sentences=None,
                 tokens=None):
        """Inits Corpus with path, raw_review, sentences and tokens, if passed.

        Semi-processed data can be passed in directly at any stage, and the
        methods provided in the class used to complete the preprocessing.

        Args:
            path: Path to the data file.
            raw_review: Data in a list.
            sentences: List of [data index, sentence] pairs.
            tokens: List of lists, each holding a data index followed by a
                tokenized sentence.
        """
        self.path = path
        self.raw_review = raw_review if raw_review is not None else []
        self.sentences = sentences if sentences is not None else []
        self.tokens = tokens if tokens is not None else []
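    # Illustrative only: a caller can resume from the sentence stage by
    # passing pre-split data (hypothetical values below), e.g.
    #   corpus = Corpus(sentences=[[0, u"Great phone"], [0, u"Battery lasts"]])
    #   corpus.tokenize_simple()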

    def read_reviews(self, delimiter='\t', reg=r"(\\u[0-z][0-z][0-z])\w",
                     rep=" "):
        """Read reviews and store them in a list.

        Args:
            delimiter: The separator between data columns.
            reg: Custom regex to filter out of the data.
            rep: String that replaces the regex matches.

        Raises:
            IOError: File not found.
            Exception: Data format in the opened file does not follow the
                specified template style.
        """
        # Open the data file
        try:
            with io.open(self.path, 'rb') as raw:
                self.raw_review = [x.strip().split(delimiter) for x in raw]
        except IOError:
            raise IOError('File not found')
        try:
            for i, val in enumerate(self.raw_review):
                # Keep only the last column, which holds the data
                self.raw_review[i] = self.raw_review[i][-1]
                # Replace stray unicode-escape artifacts matched by the regex
                self.raw_review[i] = re.sub(reg, rep, self.raw_review[i])
        except Exception:
            raise Exception('Data format in the file does not follow template')
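    # Illustrative input (hypothetical): a line such as
    #   id_01<TAB>5<TAB>Great phone. Battery lasts long.
    # leaves "Great phone. Battery lasts long." in raw_review after the
    # last column is kept and the regex substitution is applied.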

    def split_sentence(self, min_len=2):
        """Split each data index into its individual sentences.

        Splits the data at periods.

        Args:
            min_len: Minimum length of a sentence, in characters, above
                which it is included.
        """
        # Iterate over every unique data value
        for i, val in enumerate(self.raw_review):
            # Split into sentences at periods
            sentence = self.raw_review[i].split('.')
            # Append each sentence to self.sentences, tagged with its index
            for j, v in enumerate(sentence):
                # Keep only sentences longer than <min_len> characters
                if len(sentence[j]) > min_len:
                    self.sentences.append([i, sentence[j]])
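    # Illustrative only: with raw_review[0] == "Great phone. Battery lasts
    # long.", this appends [0, "Great phone"] and [0, " Battery lasts long"];
    # the empty trailing split is dropped by the length check.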

    def tokenize_simple(self, deacc=False, min_len=2, max_len=15):
        """Processes sentences.

        Tokenizes, ignoring tokens that are too small or too large.

        Args:
            deacc: Remove accentuation.
            min_len: Minimal length of token in result.
            max_len: Maximum length of token in result.
        """
        # Simple tokenize, de-accent and lowercase processor
        for i, val in enumerate(self.sentences):
            self.tokens.append([self.sentences[i][0],
                                gensim.utils.simple_preprocess(
                                    self.sentences[i][1], deacc=deacc,
                                    min_len=min_len, max_len=max_len)])
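    # Illustrative only: gensim.utils.simple_preprocess(u"Great phone!")
    # lowercases and tokenizes to [u'great', u'phone'], dropping tokens
    # whose length falls outside [min_len, max_len].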

    def tokenize_custom(self, min_len=1):
        """Processes sentences.

        Tokenizes, ignores tokens that are too small, lemmatizes, and
        filters out grammar (stop words, symbols, pronouns, auxiliaries,
        prepositions, determiners, conjunctions, numbers).

        Args:
            min_len: Tokens of this length or shorter are dropped.
        """
        # POS-tag each sentence and filter out grammatical tokens
        for i, val in enumerate(self.sentences):
            doc = NLP(force_unicode(self.sentences[i][1]))
            to = [unicode(self.sentences[i][0])]
            for tok in doc:
                if (not tok.is_stop and tok.pos_ not in ('SYM', 'NUM')
                        and tok.tag_ not in ('PRP', 'PRP$')
                        and tok.dep_ not in ('aux', 'prep', 'det', 'cc')
                        and len(tok) > min_len):
                    to.append(tok.lemma_)
            # Keep the entry only if at least one token survived the filter
            if len(to) > 1:
                self.tokens.append(to)
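    # Illustrative only (exact output depends on the spaCy model version):
    # u"The batteries lasted 3 days" keeps roughly
    # [u'0', u'battery', u'last', u'day'] once the determiner, the number
    # and any stop words are removed and the survivors are lemmatized.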

    def write_processed(self, name):
        """Save to file.

        Appends the tokens to the given file, one comma-separated row per
        entry.

        Args:
            name: Name of the output file.

        Raises:
            IOError: Path does not exist.
            Exception: self.tokens structure not supported; manually check
                its value.
        """
        try:
            # Write the preprocessed reviews to file, one row per entry
            with io.open(name, "a", encoding='utf8') as outfile:
                for i, val in enumerate(self.tokens):
                    outfile.write(unicode(','.join(self.tokens[i]) + "\n"))
        except IOError:
            raise IOError('Path does not exist')
        except Exception:
            raise Exception('Error while saving file, check self.tokens value')
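
# Minimal end-to-end sketch of the intended pipeline. The file names
# 'reviews.tsv' and 'tokens.csv' are hypothetical; the input is assumed
# to follow the tab-separated template documented on Corpus.
if __name__ == '__main__':
    corpus = Corpus(path='reviews.tsv')
    corpus.read_reviews()            # load and clean the raw data
    corpus.split_sentence()          # break each entry into sentences
    corpus.tokenize_custom()         # lemmatize and drop grammatical tokens
    corpus.write_processed('tokens.csv')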