Source code for sptm.conditional

# -*- coding: utf-8 -*-

"""
    Compute conditional probability matrix
"""

#######################################

import csv
import ast
import operator
import numpy as np

__author__ = "Rochan Avlur Venkat"
__credits__ = ["Anupam Mediratta"]
__license__ = "MIT"
__version__ = "1.0"
__maintainer__ = "Rochan Avlur Venkat"
__email__ = "rochan170543@mechyd.ac.in"

#######################################

[docs]class ConditionalMatrix: """Compute the conditional matrix of topics From the data used to train the LDA model, make a matrix of topics vs topics and compute the conditional probability of topic B occuring after topic A. Take this matrix and process it (sort and label it). Attributes: doc_matrix: Output from training the LDA model provided by Mallet. Contains the topic probabilities of each sentence. data_index: list containing the data index number for each sentence. num_sent: Number of sentences num_topics: Number of topics topic_freq: Sum of the weights for a topic over the whole dataset freq_matrix: Matrix of floats (conditional probabilities) labels: List of strings, labels for each topic, has to be manually labeled labeled: freq_matrix with labels sorted: freq_matrix with labels and sorted in decending order """ def __init__(self, doctopics_path, tokens_path): """Inits ConditionalMatrix with path to document topic file and tokens file Compute the conditional probability matrix of each sentence versus another. Open the document topic file, Get the data number for every sentence used Args: doctopics_path: Location of the document topic file tokens_path: Path to token file Raises: IOError: File not found Exception: Error while reading a row (Mostly due to empty row) """ self.doc_matrix = [] try: with open(doctopics_path, 'r') as F: reader = csv.reader(F, delimiter='\t') for row in reader: # Delete the first two columns in the file self.doc_matrix.append(row[2:]) except IOError: raise IOError('File not found') except Exception: raise Exception('Error reading doctopic file') self.data_index = [] try: with open(tokens_path, 'rU') as F: reader = csv.reader((line.replace('\0', '') for line in F), \ delimiter=',') for row in reader: try: if len(row) > 0: self.data_index.append(int(row[0])) except Exception: raise Exception('Some error while reading the row') except IOError: raise IOError('Error reading doctopic file') self.num_sent = len(self.doc_matrix) self.num_topics = len(self.doc_matrix[0])
[docs] def construct_matrix(self): """Compute the conditional probabilities Construct a simple frequency matrix of each topic (current sentence) vs topic (next sentence), Identify the topics with high probabilities for each sentence Raises: Exception: Sentence Missing, you can ignore this message """ # Create a dictionary of all the topic frequencies self.topic_freq = dict() for i in range(0, self.num_topics): self.topic_freq[i] = 0 # Construct an empty matrix self.freq_matrix = np.zeros((self.num_topics, self.num_topics), \ dtype=float) # Iterate over all the (sentences - 1) for i in range(self.num_sent - 1): try: if self.data_index[i] == self.data_index[i + 1]: topic_dict = [dict(), dict()] for j in range(0, self.num_topics): topic_dict[0][j] = \ ast.literal_eval(self.doc_matrix[i][j]) topic_dict[1][j] = \ ast.literal_eval(self.doc_matrix[i + 1][j]) for x in range(self.num_topics): for y in range(self.num_topics): self.freq_matrix[x][y] += \ float(topic_dict[1][y])*float(topic_dict[0][x]) self.topic_freq[x] += float(topic_dict[0][x]) except Exception: raise Exception('Sentence Missing, can ignore this message') # Conditional probability for i in range(self.num_topics): for j in range(self.num_topics): if float(self.topic_freq[i]) != 0: self.freq_matrix[i][j] = \ (float(self.freq_matrix[i][j]) / \ float(self.topic_freq[i])) * 65
[docs] def sort_and_label(self, labels_path): """Sort and label each value in the matrix Args: labels_path: Path to a labels file Raises: IOError: Labels file not found Exception: Error matching topics and labels, Error sorting Conditional Probabilities """ self.labels = [] try: with open(labels_path, 'r') as F: reader = csv.reader(F, delimiter=',') for row in reader: self.labels.append(row) except IOError: raise IOError('Labels file not found') self.labeled = [] try: for i in range(self.num_topics): self.labeled.append({}) self.labeled[i]['label'] = self.labels[i][0] for j in range(self.num_topics): self.labeled[i][self.labels[j][0]] = self.freq_matrix[i][j] except Exception: raise Exception('Error matching topics and labels') self.sorted = [] try: for i in range(self.num_topics): sort_row = sorted(self.labeled[i].items(), \ key=operator.itemgetter(1)) self.sorted.append(sort_row) except Exception: raise Exception('Error sorting Conditional Probabilities')
[docs] def save(self, output_path, matrix): """Save matrix Args: output_path: Location with filename to save matrix matrix: Matrix to save Raises: IOError: Output path does not exist """ try: with open(output_path, 'w') as F: writer = csv.writer(F, delimiter=',') for r in matrix: writer.writerow(r) except IOError: raise IOError('Path does not exist')