# -*- coding: utf-8 -*-
"""
Adjust, train and optimize Model
"""
#######################################
import logging
import codecs
import multiprocessing
from shutil import copy2
import gensim.corpora as corpora
import gensim.models.wrappers as Wrappers
import gensim.utils as utils
from gensim.models import CoherenceModel
from sptm.utils import force_unicode
__author__ = "Rochan Avlur Venkat"
__credits__ = ["Anupam Mediratta"]
__license__ = "MIT"
__version__ = "1.0"
__maintainer__ = "Rochan Avlur Venkat"
__email__ = "rochan170543@mechyd.ac.in"
#######################################
# Route gensim's progress messages through the root logger at INFO level,
# prefixed with a timestamp and the severity.
logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO)
class Model:
    """Adjust, train and optimize an LDA topic model.

    This class is responsible for training the topic model using Mallet's
    LDA implementation (http://mallet.cs.umass.edu/topics.php) through
    gensim's Mallet wrapper.

    Attributes:
        mallet_path: Path to Mallet binary
        tokens: List of token lists, one per document (the leading data
            index of every input row is dropped)
        id2word: Dictionary of the corpus (set by fit())
        corpus: Term document frequency (set by fit())
        alpha: Model alpha hyperparameter
        workers: Number of workers spawned while training the model
        prefix: prefix
        optimize_interval: Number of iterations after which to re-evaluate
            hyperparameters
        iterations: Number of iterations
        topic_threshold: topic threshold
        num_topics: Number of topics
        lda_model_mallet: Gensim Mallet LDA wrapper object (set by train()
            or load())
    """

    def __init__(self, mallet_path, tokens=None, input_path=None):
        """Inits Model with Mallet path, tokenized data or input path to open
        saved tokenized data.

        NOTE: If both input_path and tokens is given, tokens will always take
        higher preference

        Args:
            mallet_path: Location of Mallet binary
            tokens: tokens of preprocessed data; every row is
                [index, token, token, ...]
            input_path: Location of saved preprocessed tokens file

        Raises:
            IOError: Tokens file not found or not in specified format
            Exception: Not in specified structure
        """
        self.mallet_path = mallet_path
        self.tokens = []
        # Seed the hyperparameters with their defaults so that train() and
        # optimum_topic() do not fail with AttributeError when the caller
        # never invoked params() explicitly.
        self.params()

        if tokens is not None:
            # An explicit tokens list always wins over input_path.
            print('Using tokens passed as argument')
            self.tokens = self._strip_index(tokens)
        elif input_path is not None:
            print('Opening tokens file')
            self.tokens = self._read_tokens(input_path)
        else:
            print("Assuming load model from saved file, use Model.load()")

    @staticmethod
    def _strip_index(tokens):
        """Return every row of tokens without its leading index column."""
        try:
            return [row[1:] for row in tokens]
        except Exception:
            raise Exception("Tokens list does not follow required " +
                            "structure")

    @staticmethod
    def _read_tokens(input_path):
        """Read a comma separated tokens file, dropping each row's index."""
        rows = []
        try:
            with codecs.open(input_path, 'r', encoding='utf8') as handle:
                for line in handle:
                    # Strip the line terminator first; otherwise the last
                    # token of every row would end in "\n" / "\r\n" and
                    # never match the same word elsewhere in the corpus.
                    fields = line.rstrip("\r\n").split(",")
                    rows.append(
                        [force_unicode(field) for field in fields[1:]])
        except IOError:
            raise IOError("File not found")
        except Exception:
            raise Exception("Tokens list does not follow required " +
                            "structure")
        return rows

    def fit(self):
        """Generate the id2word dictionary and term document frequency of
        the given tokens

        NOTE: Should be called only after making sure that the tokens
        have been properly read

        Raises:
            Exception: self.tokens empty or not in required format
        """
        try:
            # Create Dictionary
            self.id2word = corpora.Dictionary(self.tokens)
            # Term Document Frequency
            self.corpus = [
                self.id2word.doc2bow(text) for text in self.tokens]
        except Exception:
            raise Exception('tokens not compatible')

    def params(self, alpha=50, workers=multiprocessing.cpu_count(),
               prefix=None, optimize_interval=0, iterations=1000,
               topic_threshold=0.0, num_topics=100):
        """Model parameters

        NOTE: These are the same parameters used while training models
        for coherence computation. Call this function to re-initialize
        parameter values in that case

        Args:
            alpha: Alpha value (Dirichlet Hyperparameter)
            workers: Number of threads to spawn to parallel training process
            prefix: prefix
            optimize_interval: Number of intervals after which to recompute
                hyperparameters
            iterations: Number of iterations
            topic_threshold: Topic threshold
            num_topics: Number of topics
        """
        self.alpha = alpha
        self.workers = workers
        self.prefix = prefix
        self.optimize_interval = optimize_interval
        self.iterations = iterations
        self.topic_threshold = topic_threshold
        self.num_topics = num_topics

    def _build_mallet(self, num_topics):
        """Train a Mallet LDA wrapper with num_topics topics and the
        hyperparameters currently stored on the instance."""
        return Wrappers.LdaMallet(
            self.mallet_path,
            corpus=self.corpus, num_topics=num_topics,
            alpha=self.alpha, id2word=self.id2word,
            workers=self.workers, prefix=self.prefix,
            optimize_interval=self.optimize_interval,
            iterations=self.iterations,
            topic_threshold=self.topic_threshold)

    def _c_v_coherence(self, model):
        """Return the c_v coherence of model over the instance's tokens."""
        scorer = CoherenceModel(
            model=model, texts=self.tokens, dictionary=self.id2word,
            coherence='c_v')
        return scorer.get_coherence()

    def train(self):
        """Train LDA Mallet model using gensim's Mallet wrapper

        Uses the corpus and dictionary produced by fit() and the
        hyperparameters set through params().
        """
        self.lda_model_mallet = self._build_mallet(self.num_topics)

    def topics(self, num_topics=100, num_words=10):
        """Return top <num_words> words for the first <num_topics> topics

        Args:
            num_topics: Number of topics to print
            num_words: Number of top words to print for each topic

        Returns:
            List of topics and top words
        """
        return self.lda_model_mallet.print_topics(num_topics, num_words)

    def save(self, output_path):
        """Save the Mallet LDA model

        Also, save the document_topic distribution, corpus and inferencer
        next to it as <output_path>_doctopic, <output_path>_inferencer and
        <output_path>_corpus.

        Args:
            output_path: Location with filename to save the LDA model

        Raises:
            IOError: Error with output_path / File already exists
        """
        artifacts = (
            (self.lda_model_mallet.fdoctopics(), "_doctopic"),
            (self.lda_model_mallet.finferencer(), "_inferencer"),
            (self.lda_model_mallet.fcorpusmallet(), "_corpus"),
        )
        try:
            for source, suffix in artifacts:
                copy2(source, output_path + suffix)
        except Exception:
            # Any failure while copying is reported as IOError, which is
            # the exception type callers of save() already handle.
            raise IOError('Error with output path / File already exists')
        self.lda_model_mallet.save(output_path)

    def get_coherence(self):
        """Compute Coherence Score of the model

        NOTE: You cannot compute the coherence score of a saved model

        Returns:
            Float value
        """
        return self._c_v_coherence(self.lda_model_mallet)

    def optimum_topic(self, start=10, limit=100, step=11):
        """Compute c_v coherence for various number of topics and return
        the best one

        One model is trained for every value in range(start, limit, step).
        If you want to change the parameters of the model while training,
        call Model.params() first as it uses the same parameters.

        NOTE: You cannot compute the coherence score of a saved model.

        Args:
            start: Smallest number of topics to try
            limit: Upper bound (exclusive) on the number of topics
            step: Increment between two candidate numbers of topics

        Returns:
            Dictionary {"num_topics": ..., "c_v": ...} describing the
            candidate with the highest coherence (c_v rounded to 4
            decimals), or an empty dictionary when the range is empty
        """
        best = {}
        best_score = None
        for num_topics in range(start, limit, step):
            # Models are scored one at a time and then dropped, so only a
            # single trained model is alive at any moment.
            score = self._c_v_coherence(self._build_mallet(num_topics))
            is_better = (
                best_score is None
                or score > best_score
                # A NaN incumbent never wins a comparison; let any real
                # score displace it.
                or (best_score != best_score and score == score))
            if is_better:
                best_score = score
                best = {"num_topics": num_topics, "c_v": round(score, 4)}
        return best

    def load(self, saved_model):
        """Load a Mallet LDA model previously saved

        Args:
            saved_model: Location to saved model

        Raises:
            IOError: File already present or location does not exist
        """
        try:
            # NOTE(security): gensim's SaveLoad.load unpickles the file;
            # only load models that come from a trusted source.
            self.lda_model_mallet = utils.SaveLoad.load(saved_model)
        except IOError:
            raise IOError('File already present or location does not exist')