Source code for hypertools.tools.reduce

#!/usr/bin/env python

# libraries
import warnings
import numpy as np

## reduction models
from .._externals.ppca import PPCA
from sklearn.decomposition import PCA, FastICA, IncrementalPCA, KernelPCA, FactorAnalysis, TruncatedSVD, SparsePCA, MiniBatchSparsePCA, DictionaryLearning, MiniBatchDictionaryLearning
from sklearn.manifold import TSNE, MDS, SpectralEmbedding, LocallyLinearEmbedding, Isomap

# internal libraries
from ..tools.df2mat import df2mat
from ..tools.normalize import normalize as normalizer
from .._shared.helpers import *

# main function
def reduce(x, ndims=3, model='IncrementalPCA', model_params={}, normalize=False, internal=False, align=False):
    """
    Reduces dimensionality of an array, or list of arrays

    Parameters
    ----------
    x : Numpy array or list of arrays
        Dimensionality reduction using PCA is performed on this array. If
        there are nans present in the data, the function will try to use
        PPCA to interpolate the missing values.

    ndims : int
        Number of dimensions to reduce to

    model : str
        Decomposition/manifold learning model to use. Models supported:
        PCA, IncrementalPCA, SparsePCA, MiniBatchSparsePCA, KernelPCA,
        FastICA, FactorAnalysis, TruncatedSVD, DictionaryLearning,
        MiniBatchDictionaryLearning, TSNE, Isomap, SpectralEmbedding,
        LocallyLinearEmbedding, and MDS.

    model_params : dict
        Optional dictionary of scikit-learn parameters to pass to the
        reduction model. See the scikit-learn docs for the specific model
        for details.

    normalize : str or False
        If set to 'across', the columns of the input data will be z-scored
        across lists. If set to 'within', the columns will be z-scored
        within each list that is passed. If set to 'row', each row of the
        input data will be z-scored. If set to False, the input data will
        be returned unchanged (default is False).

    internal : bool
        If set to True, the result is always returned as a list, even when
        a single array was passed (default: False).

    align : bool
        If set to True, the data will be run through the hyperalignment
        algorithm implemented in hypertools.tools.align (default: False).

    Returns
    ----------
    x_reduced : Numpy array or list of arrays
        The reduced data with ndims dimensionality is returned. If the
        input is a list, a list is returned.

    """

    # sub functions
    def fill_missing(x):

        # ppca if missing data
        m = PPCA()
        m.fit(data=np.vstack(x))
        x_pca = m.transform()

        # if the whole row is missing, return nans
        all_missing = [idx for idx, a in enumerate(np.vstack(x)) if all(np.isnan(a))]
        if len(all_missing) > 0:
            for i in all_missing:
                x_pca[i, :] = np.nan

        # get the original lists back
        if len(x) > 1:
            x_split = np.cumsum([i.shape[0] for i in x][:-1])
            return list(np.split(x_pca, x_split, axis=0))
        else:
            return [x_pca]

    def reduce_list(x, model, model_params):
        split = np.cumsum([len(xi) for xi in x])[:-1]
        m = model(**model_params)
        x_r = np.vsplit(m.fit_transform(np.vstack(x)), split)
        if len(x) > 1:
            return [xi for xi in x_r]
        else:
            return [x_r[0]]

    # dictionary of models
    models = {
        'PCA': PCA,
        'IncrementalPCA': IncrementalPCA,
        'SparsePCA': SparsePCA,
        'MiniBatchSparsePCA': MiniBatchSparsePCA,
        'KernelPCA': KernelPCA,
        'FastICA': FastICA,
        'FactorAnalysis': FactorAnalysis,
        'TruncatedSVD': TruncatedSVD,
        'DictionaryLearning': DictionaryLearning,
        'MiniBatchDictionaryLearning': MiniBatchDictionaryLearning,
        'TSNE': TSNE,
        'Isomap': Isomap,
        'SpectralEmbedding': SpectralEmbedding,
        'LocallyLinearEmbedding': LocallyLinearEmbedding,
        'MDS': MDS
    }

    # main
    x = format_data(x)

    assert all([i.shape[1] > ndims for i in x]), "In order to reduce the data, ndims must be less than the number of dimensions"

    # if there are any nans in any of the lists, use ppca
    if np.isnan(np.vstack(x)).any():
        warnings.warn('Missing data: Inexact solution computed with PPCA (see https://github.com/allentran/pca-magic for details)')
        x = fill_missing(x)

    # normalize
    if normalize:
        x = normalizer(x, normalize=normalize)

    # build model params dict
    if model_params == {}:
        model_params = {'n_components': ndims}
    elif 'n_components' not in model_params:
        model_params['n_components'] = ndims

    # reduce data
    x_reduced = reduce_list(x, models[model], model_params)

    if align:
        # import is here to avoid circular imports with reduce.py
        from .align import align as aligner
        x_reduced = aligner(x_reduced)

    # return data
    if internal or len(x_reduced) > 1:
        return x_reduced
    else:
        return x_reduced[0]
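
A minimal usage sketch (not part of the module above), assuming hypertools is installed and the function is called via hypertools.tools.reduce; the array shapes, the TSNE choice, and the perplexity value are illustrative only:

    import numpy as np
    import hypertools as hyp

    # two arrays with the same number of columns (features)
    data = [np.random.rand(100, 10), np.random.rand(100, 10)]

    # reduce each array to 3 dimensions with the default IncrementalPCA model
    reduced = hyp.tools.reduce(data, ndims=3)

    # pass scikit-learn parameters through model_params, e.g. for TSNE
    reduced_tsne = hyp.tools.reduce(data, ndims=2, model='TSNE',
                                    model_params={'perplexity': 30})

Because a list of arrays was passed, both calls return a list of reduced arrays; passing a single array would return a single array unless internal=True.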