Module mogptk.data
import re
import copy
import inspect
import datetime
import logging
import numpy as np
import pandas as pd
from scipy import signal
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from pandas.plotting import register_matplotlib_converters
from .bnse import bse
from .serie import Serie, TransformLinear
register_matplotlib_converters()
logger = logging.getLogger('mogptk')
def LoadFunction(f, start, end, n, var=0.0, name="", random=False):
"""
    LoadFunction loads a dataset from a given function y = f(x) + Normal(0,var). It picks `n` data points between `start` and `end` on the X axis at which `f` is evaluated. By default the `n` points are spread evenly over the interval; with `random=True` they are picked randomly.
The given function should take one argument X of shape (n,input_dims) and return Y of shape (n,). If your data has only one input dimension, you can use X[:,0] to select the first (and only) input dimension.
Args:
f (function): Function taking X with shape (n,input_dims) and returning shape (n,) as Y.
start (float, list): Define start of interval.
end (float, list): Define end of interval.
n (int): Number of data points to pick between start and end.
var (float): Variance added to the output.
name (str): Name of data.
random (boolean): Select points randomly between start and end.
Returns:
mogptk.data.Data
Examples:
>>> LoadFunction(lambda x: np.sin(3*x[:,0]), 0, 10, n=200, var=0.1, name='Sine wave')
<mogptk.data.Data at ...>
"""
if type(start) is not type(end):
raise ValueError("start and end must be of the same type")
if isinstance(start, np.ndarray):
if start.ndim == 0:
start = [start.item()]
end = [end.item()]
else:
start = list(start)
end = list(end)
if not isinstance(start, list):
start = [start]
end = [end]
if len(start) != len(end):
raise ValueError("start and end must be of the same length")
if not _is_homogeneous_type(start + end):
raise ValueError("start and end must have elements of the same type")
if isinstance(start[0], datetime.datetime) or isinstance(start[0], str) or isinstance(start[0], np.datetime64):
# convert datetime.datetime or strings to np.datetime64
for i in range(len(start)):
try:
start[i] = np.datetime64(start[i], 'us')
end[i] = np.datetime64(end[i], 'us')
except:
raise ValueError("start and end must have a number or datetime data type")
else:
for i in range(len(start)):
try:
start[i] = np.float64(start[i])
end[i] = np.float64(end[i])
except:
raise ValueError("start and end must have a number or datetime data type")
input_dims = len(start)
is_datetime64 = isinstance(start[0], np.datetime64)
_check_function(f, input_dims, is_datetime64)
if is_datetime64:
if random:
raise ValueError("cannot use random for datetime inputs")
x = np.empty((n, input_dims), dtype=start[0].dtype)
else:
x = np.empty((n, input_dims))
for i in range(input_dims):
if start[i] >= end[i]:
if input_dims == 1:
raise ValueError("start must be lower than end")
else:
raise ValueError("start must be lower than end for input dimension %d" % (i,))
if is_datetime64:
dt = (end[i]-start[i]) / float(n-1)
dt = _timedelta64_to_higher_unit(dt)
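            # the +1us upper bound makes np.arange include the final point start+dt*(n-1)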
x[:,i] = np.arange(start[i], start[i]+dt*(n-1)+np.timedelta64(1,'us'), dt)
elif random:
x[:,i] = np.random.uniform(start[i], end[i], n)
else:
x[:,i] = np.linspace(start[i], end[i], n)
y = f(x)
if y.ndim == 2 and y.shape[1] == 1:
y = y[:,0]
    y += np.random.normal(0.0, np.sqrt(var), n)  # var is a variance, while np.random.normal expects a standard deviation
data = Data(x, y, name=name)
data.set_function(f)
return data
################################################################
################################################################
################################################################
class Data:
def __init__(self, X, Y, name=None, x_labels=None, y_label=None):
"""
Data class that holds all observations, latent functions and predicted data.
        This class accepts the data directly, otherwise you can load data conveniently using `LoadFunction`, `LoadCSV`, `LoadDataFrame`, etc. The data class allows you to modify the data before passing it into the model. Examples are transforming data, such as detrending or taking the log; removing data ranges to simulate sensor failure; and aggregating data for given spans on X, such as aggregating daily data into weekly data. Additionally, we also use this class to set the range we want to predict.
It is possible to use the format given by `numpy.meshgrid` for X and its values in Y.
Args:
X (list, numpy.ndarray, dict): Independent variable data of shape (n,) or (n,input_dims).
Y (list, numpy.ndarray): Dependent variable data of shape (n,).
name (str): Name of data.
x_labels (str, list of str): Name or names of input dimensions.
y_label (str): Name of output dimension.
Examples:
>>> channel = mogptk.Data([0, 1, 2, 3], [4, 3, 5, 6])
"""
# convert dicts to lists
if x_labels is not None:
if isinstance(x_labels, str):
x_labels = [x_labels]
if not isinstance(x_labels, list) or not all(isinstance(label, str) for label in x_labels):
raise ValueError("x_labels must be a string or list of strings for each input dimension")
if isinstance(X, dict):
it = iter(X.values())
first = len(next(it))
if not all(isinstance(x, (list, np.ndarray)) for x in X.values()) or not all(len(x) == first for x in it):
raise ValueError("X dict should contain all lists or np.ndarrays where each has the same length")
if not all(key in X for key in x_labels):
raise ValueError("X dict must contain all keys listed in x_labels")
X = list(map(list, zip(*[X[key] for key in x_labels])))
# check if X and Y are correct inputs
if isinstance(X, list):
if all(isinstance(x, list) for x in X):
m = len(X[0])
if not all(len(x) == m for x in X[1:]):
raise ValueError("X list items must all be lists of the same length")
if not all(all(isinstance(val, (int, float, datetime.datetime, np.datetime64)) for val in x) for x in X):
raise ValueError("X list items must all be lists of numbers or datetime")
                if not all(_is_homogeneous_type(x) for x in X):
raise ValueError("X list items must all be lists with elements of the same type")
elif all(isinstance(x, np.ndarray) for x in X):
m = len(X[0])
if not all(len(x) == m for x in X[1:]):
raise ValueError("X list items must all be numpy.ndarrays of the same length")
elif not all(isinstance(x, (int, float, datetime.datetime, np.datetime64)) for x in X):
raise ValueError("X list items must be all lists, all numpy.ndarrays, or all numbers or datetime")
elif not _is_homogeneous_type(X):
raise ValueError("X list items must all have elements of the same type")
X = np.array(X)
if isinstance(Y, list):
if not all(isinstance(y, (int, float)) for y in Y):
raise ValueError("Y list items must all be numbers")
elif not _is_homogeneous_type(Y):
raise ValueError("Y list items must all have elements of the same type")
Y = np.array(Y)
if not isinstance(X, np.ndarray) or not isinstance(Y, np.ndarray):
raise ValueError("X and Y must be lists or numpy arrays, if dicts are passed then x_labels and/or y_label must also be set")
        # try to cast unknown data types: X becomes np.float64 or np.datetime64, and Y becomes np.float64
if X.dtype == np.object_ or np.issubdtype(X.dtype, np.character):
# convert datetime.datetime or strings to np.datetime64
try:
X = X.astype(np.datetime64)
except:
raise ValueError("X data must have a number or datetime data type")
elif not np.issubdtype(X.dtype, np.datetime64):
try:
X = X.astype(np.float64)
except:
raise ValueError("X data must have a number or datetime data type")
try:
Y = Y.astype(np.float64)
except:
raise ValueError("Y data must have a number data type")
# convert X datetime64[us] to a higher unit like s, m, h, D, ...
if np.issubdtype(X.dtype, np.datetime64):
X = _datetime64_to_higher_unit(X)
# convert meshgrids to flat arrays
if 2 < X.ndim and 1 < Y.ndim and X.shape[1:] == Y.shape:
X = np.vstack(list(map(np.ravel, X))).T
Y = np.ravel(Y)
if X.ndim == 1:
X = X.reshape(-1, 1)
if X.ndim != 2:
raise ValueError("X must be either a one or two dimensional array of data")
if Y.ndim != 1:
raise ValueError("Y must be a one dimensional array of data")
if X.shape[0] != Y.shape[0]:
raise ValueError("X and Y must be of the same length")
if Y.shape[0] == 0:
raise ValueError("X and Y must have a length greater than zero")
input_dims = X.shape[1]
self.X = [Serie(X[:,i]) for i in range(input_dims)] # [shape (n)] * input_dims
self.Y = Serie(Y) # shape (n)
self.mask = np.array([True] * Y.shape[0])
self.F = None
self.X_pred = self.X
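        # predictions default to the observed X positions until set_prediction_x or set_prediction_range is called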
self.Y_mu_pred = {}
self.Y_var_pred = {}
        self.removed_ranges = [[] for _ in range(input_dims)] # independent list per input dimension
self.X_labels = ['X'] * input_dims
if 1 < input_dims:
for i in range(input_dims):
self.X_labels[i] = 'X%d' % (i,)
if isinstance(x_labels, list) and all(isinstance(item, str) for item in x_labels):
self.X_labels = x_labels
self.name = None
if isinstance(name, str):
self.name = name
elif isinstance(y_label, str):
self.name = y_label
self.Y_label = 'Y'
if isinstance(y_label, str):
self.Y_label = y_label
def __repr__(self):
df = pd.DataFrame()
for i in range(len(self.X)):
df[self.X_labels[i]] = self.X[i]
df[self.Y_label] = self.Y
return repr(df)
def copy(self):
"""
Make a deep copy of `Data`.
Returns:
mogptk.data.Data
Examples:
>>> other = data.copy()
"""
return copy.deepcopy(self)
def set_name(self, name):
"""
Set name for data channel.
Args:
name (str): Name of data.
Examples:
>>> data.set_name('Channel A')
"""
self.name = name
def set_labels(self, x_labels, y_label):
"""
Set axis labels for plots.
Args:
x_labels (str, list of str): X data names for each input dimension.
y_label (str): Y data name for output dimension.
Examples:
>>> data.set_labels(['X', 'Y'], 'Cd')
"""
if isinstance(x_labels, str):
x_labels = [x_labels]
elif not isinstance(x_labels, list) or not all(isinstance(item, str) for item in x_labels):
raise ValueError("x_labels must be list of strings")
if not isinstance(y_label, str):
raise ValueError("y_label must be string")
if len(x_labels) != self.get_input_dims():
raise ValueError("x_labels must have the same input dimensions as the data")
self.X_labels = x_labels
self.Y_label = y_label
def set_function(self, f):
"""
        Set the (latent) function for the data, i.e. the theoretical or true signal. This is used for plotting purposes and is optional.
The function should take one argument X of shape (n,input_dims) and return Y of shape (n,). If your data has only one input dimension, you can use X[:,0] to select the first (and only) input dimension.
Args:
f (function): Function taking X with shape (n,input_dims) and returning shape (n,) as Y.
Examples:
            >>> data.set_function(lambda x: np.sin(3*x[:,0]))
"""
_check_function(f, self.get_input_dims(), self.X[0].is_datetime64())
self.F = f
def rescale_x(self, upper=1000.0):
"""
        Rescale the X axis so that it lies in the interval [0.0,upper]. This helps when training most kernels.
Args:
upper (float): Upper end of the interval.
Examples:
>>> data.rescale_x()
"""
for i in range(self.get_input_dims()):
X = self.X[i].transformed
xmin = np.min(X)
xmax = np.max(X)
t = TransformLinear(xmin, (xmax-xmin)/upper)
t.set_data(X)
self.X[i].apply(t)
def transform(self, transformer):
"""
Transform the Y axis data by using one of the provided transformers, such as `TransformDetrend`, `TransformLinear`, `TransformLog`, `TransformNormalize`, `TransformStandard`, etc.
Args:
transformer (obj): Transformer object derived from TransformBase.
Examples:
>>> data.transform(mogptk.TransformDetrend(degree=2)) # remove polynomial trend
>>> data.transform(mogptk.TransformLinear(slope=1, bias=2)) # remove linear trend
>>> data.transform(mogptk.TransformLog) # log transform the data
>>> data.transform(mogptk.TransformNormalize) # transform to [-1,1]
>>> data.transform(mogptk.TransformStandard) # transform to mean=0, var=1
"""
t = transformer
if isinstance(t, type):
t = transformer()
else:
t = copy.deepcopy(t)
t.set_data(self)
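        # apply the transformer to Y; X is passed along since some transformers depend on the input axis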
self.Y.apply(t, np.array([x for x in self.X]).T)
def filter(self, start, end):
"""
Filter the data range to be between `start` and `end` in the X axis.
Args:
start (float, str): Start of interval.
end (float, str): End of interval.
Examples:
>>> data.filter(3, 8)
>>> data.filter('2016-01-15', '2016-06-15')
"""
if self.get_input_dims() != 1:
raise ValueError("can only filter on one dimensional input data")
start = self._normalize_val(start)
end = self._normalize_val(end)
ind = (self.X[0] >= start[0]) & (self.X[0] < end[0])
self.X[0] = self.X[0][ind]
self.Y = self.Y[ind]
self.mask = self.mask[ind]
def aggregate(self, duration, f=np.mean):
"""
Aggregate the data by duration and apply a function to obtain a reduced dataset.
        For example, group daily data by week and take the mean. The duration can be set as a number which defines the intervals on the X axis, or as a string in the duration format when the X axis has data type `numpy.datetime64`. The duration format uses: Y=year, M=month, W=week, D=day, h=hour, m=minute, and s=second. For example, 3W1D means three weeks and one day, i.e. 22 days, and 6M means six months.
Args:
duration (float, str): Duration along the X axis or as a string in the duration format.
f (function): Function to use to reduce data.
Examples:
>>> data.aggregate(5)
>>> data.aggregate('2W', f=np.sum)
"""
if self.get_input_dims() != 1:
raise ValueError("can only aggregate on one dimensional input data")
start = np.min(self.X[0])
end = np.max(self.X[0])
step = _parse_delta(duration)
X = np.arange(start+step/2, end+step/2, step)
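        # X now holds the bin centers; each bin spans [X[i]-step/2, X[i]+step/2)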
Y = np.empty((len(X)))
for i in range(len(X)):
ind = (self.X[0] >= X[i]-step/2) & (self.X[0] < X[i]+step/2)
Y[i] = f(self.Y[ind])
self.X = [Serie(X, self.X[0].transformers)]
self.Y = Serie(Y, self.Y.transformers, np.array([x for x in self.X]).T)
self.mask = np.array([True] * len(self.X[0]))
################################################################
def get_name(self):
"""
Return the name of the channel.
Returns:
str
Examples:
>>> data.get_name()
'A'
"""
return self.name
def has_test_data(self):
"""
Returns True if observations have been removed using the `remove_*` methods.
Returns:
boolean
Examples:
>>> data.has_test_data()
True
"""
return False in self.mask
def get_input_dims(self):
"""
Returns the number of input dimensions.
Returns:
int: Number of input dimensions.
Examples:
>>> data.get_input_dims()
2
"""
return len(self.X)
def get_data(self, transformed=False):
"""
Returns all observations, train and test.
Arguments:
transformed (boolean): Return transformed data.
Returns:
numpy.ndarray: X data of shape (n,input_dims).
numpy.ndarray: Y data of shape (n,).
Examples:
>>> x, y = data.get_data()
"""
if transformed:
return np.array([x for x in self.X]).T, self.Y.transformed
return np.array([x for x in self.X]).T, np.array(self.Y)
def get_train_data(self, transformed=False):
"""
Returns the observations used for training.
Arguments:
transformed (boolean): Return transformed data.
Returns:
numpy.ndarray: X data of shape (n,input_dims).
numpy.ndarray: Y data of shape (n,).
Examples:
>>> x, y = data.get_train_data()
"""
if transformed:
return np.array([x[self.mask] for x in self.X]).T, self.Y.transformed[self.mask]
return np.array([x[self.mask] for x in self.X]).T, np.array(self.Y[self.mask])
def get_test_data(self, transformed=False):
"""
        Returns the observations used for testing, which correspond to the removed points.
Arguments:
transformed (boolean): Return transformed data.
Returns:
numpy.ndarray: X data of shape (n,input_dims).
numpy.ndarray: Y data of shape (n,).
Examples:
>>> x, y = data.get_test_data()
"""
X = np.array([x[~self.mask] for x in self.X]).T
if self.F is not None:
if X.shape[0] == 0:
X, _ = self.get_data()
Y = self.F(X)
if transformed:
Y = self.Y.transform(Y, X)
return X, Y
if transformed:
return X, self.Y.transformed[~self.mask]
return X, np.array(self.Y[~self.mask])
################################################################
def reset(self):
"""
Reset the data set and undo the removal of data points. That is, this reverts any calls to `remove_randomly`, `remove_range`, `remove_relative_range`, `remove_random_ranges`, and `remove_index`.
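        Examples:
            >>> data.remove_randomly(pct=0.5)
            >>> data.reset() # restore all removed observations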
"""
self.mask[:] = True
for i in range(len(self.removed_ranges)):
self.removed_ranges[i] = []
def remove_randomly(self, n=None, pct=None):
"""
Removes observations randomly on the whole range. Either `n` observations are removed, or a percentage of the observations.
Args:
n (int): Number of observations to remove randomly.
pct (float): Percentage in interval [0,1] of observations to remove randomly.
Examples:
>>> data.remove_randomly(50) # remove 50 observations
>>> data.remove_randomly(pct=0.9) # remove 90% of the observations
"""
if n is None:
if pct is None:
n = 0
else:
n = int(pct * len(self.Y))
idx = np.random.choice(len(self.Y), n, replace=False)
self.mask[idx] = False
def remove_range(self, start=None, end=None):
"""
Removes observations in the interval `[start,end]`.
Args:
start (float, str): Start of interval. Defaults to the first value in observations.
end (float, str): End of interval. Defaults to the last value in observations.
Examples:
>>> data = mogptk.LoadFunction(lambda x: np.sin(3*x[:,0]), 0, 10, n=200, var=0.1, name='Sine wave')
>>> data.remove_range(3, 8)
>>> data = mogptk.LoadCSV('gold.csv', 'Date', 'Price')
>>> data.remove_range('2016-01-15', '2016-06-15')
"""
if self.get_input_dims() != 1:
raise ValueError("can only remove ranges on one dimensional input data")
if start is None:
start = np.min(self.X[0])
if end is None:
end = np.max(self.X[0])
start = self._normalize_val(start)
end = self._normalize_val(end)
idx = np.where(np.logical_and(self.X[0] >= start[0], self.X[0] <= end[0]))
self.mask[idx] = False
self.removed_ranges[0].append([start[0], end[0]])
def remove_relative_range(self, start=0.0, end=1.0):
"""
Removes observations between `start` and `end` as a percentage of the number of observations. So `0` is the first observation, `0.5` is the middle observation, and `1` is the last observation.
Args:
start (float): Start percentage in interval [0,1].
end (float): End percentage in interval [0,1].
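        Examples:
            >>> data.remove_relative_range(0.25, 0.75) # remove the middle half of the observations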
"""
if self.get_input_dims() != 1:
raise ValueError("can only remove ranges on one dimensional input data")
start = self._normalize_val(start)
end = self._normalize_val(end)
x_min = np.min(self.X[0])
x_max = np.max(self.X[0])
for i in range(self.get_input_dims()):
start[i] = x_min + max(0.0, min(1.0, start[i])) * (x_max-x_min)
end[i] = x_min + max(0.0, min(1.0, end[i])) * (x_max-x_min)
idx = np.where(np.logical_and(self.X[0] >= start[0], self.X[0] <= end[0]))
self.mask[idx] = False
self.removed_ranges[0].append([start[0], end[0]])
def remove_random_ranges(self, n, duration):
"""
Removes a number of ranges to simulate sensor failure. May remove fewer ranges if there is no more room to remove a range in the remaining data.
Args:
n (int): Number of ranges to remove.
duration (float, str): Width of ranges to remove, can use a number or the duration format syntax (see aggregate()).
Examples:
>>> data.remove_random_ranges(2, 5) # remove two ranges that are 5 wide in input space
            >>> data.remove_random_ranges(3, '1D') # remove three ranges that are 1 day wide
"""
if self.get_input_dims() != 1:
raise ValueError("can only remove ranges on one dimensional input data")
if n < 1:
return
delta = _parse_delta(duration)
m = (np.max(self.X[0])-np.min(self.X[0])) - n*delta
if m <= 0:
raise ValueError("no data left after removing ranges")
locs = self.X[0] <= (np.max(self.X[0])-delta)
locs[sum(locs)] = True # make sure the last data point can be deleted
for i in range(n):
if len(self.X[0][locs]) == 0:
break # range could not be removed, there is no remaining data range of width delta
x = self.X[0][locs][np.random.randint(len(self.X[0][locs]))]
locs[(self.X[0] > x-delta) & (self.X[0] < x+delta)] = False
self.mask[(self.X[0] >= x) & (self.X[0] < x+delta)] = False
self.removed_ranges[0].append([x, x+delta])
def remove_index(self, index):
"""
        Removes observations at the given indices.
        Args:
            index (list, numpy.ndarray): Indices of the data points to remove.
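        Examples:
            >>> data.remove_index([0, 1, 2]) # remove the first three observations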
"""
if isinstance(index, list):
index = np.array(index)
elif not isinstance(index, np.ndarray):
raise ValueError("index must be list or numpy array")
self.mask[index] = False
################################################################
def get_prediction_names(self):
"""
Returns the model names of the saved predictions.
Returns:
list: List of prediction names.
Examples:
>>> data.get_prediction_names()
['MOSM', 'CSM', 'SM-LMC', 'CONV']
"""
        return list(self.Y_mu_pred.keys())
def get_prediction_x(self):
"""
Returns the prediction X range.
Returns:
numpy.ndarray: X prediction of shape (n,input_dims).
Examples:
>>> x = data.get_prediction_x()
"""
return np.array([x for x in self.X_pred]).T
def get_prediction(self, name, sigma=2.0, transformed=False):
"""
Returns the prediction of a given name with a confidence interval of `sigma` times the standard deviation.
Args:
name (str): Name of the model of the prediction.
sigma (float): The confidence interval's number of standard deviations.
transformed (boolean): Return transformed data as used for training.
Returns:
numpy.ndarray: X prediction of shape (n,input_dims).
numpy.ndarray: Y mean prediction of shape (n,).
numpy.ndarray: Y lower prediction of uncertainty interval of shape (n,).
numpy.ndarray: Y upper prediction of uncertainty interval of shape (n,).
Examples:
>>> x, y_mean, y_var_lower, y_var_upper = data.get_prediction('MOSM', sigma=1)
"""
if name not in self.Y_mu_pred:
raise ValueError("prediction name '%s' does not exist" % (name))
X = np.array([x for x in self.X_pred]).T
mu = self.Y_mu_pred[name]
lower = mu - sigma * np.sqrt(self.Y_var_pred[name])
upper = mu + sigma * np.sqrt(self.Y_var_pred[name])
if transformed:
return X, mu, lower, upper
mu = Serie(self.Y.detransform(mu, X), self.Y.transformers, transformed=mu)
lower = Serie(self.Y.detransform(lower, X), self.Y.transformers, transformed=lower)
upper = Serie(self.Y.detransform(upper, X), self.Y.transformers, transformed=upper)
return X, mu, lower, upper
def set_prediction_x(self, X):
"""
Set the prediction range directly for saved predictions. This will clear old predictions.
Args:
X (list, numpy.ndarray): Array of shape (n,) or (n,input_dims) used for predictions.
Examples:
>>> data.set_prediction_x([5.0, 5.5, 6.0, 6.5, 7.0])
"""
# TODO: accept multiple input dims and make sure dtype equals X
if isinstance(X, list):
X = np.array(X)
elif not isinstance(X, np.ndarray):
raise ValueError("X expected to be a list or numpy.ndarray")
X = X.astype(np.float64)
if X.ndim == 1:
X = X.reshape(-1, 1)
if X.ndim != 2 or X.shape[1] != self.get_input_dims():
raise ValueError("X shape must be (n,input_dims)")
self.X_pred = [Serie(X[:,i], self.X[i].transformers) for i in range(self.get_input_dims())]
# clear old prediction data now that X_pred has been updated
self.clear_predictions()
def set_prediction_range(self, start=None, end=None, n=None, step=None):
"""
Sets the prediction range. The interval is set as `[start,end]`, with either `n` points or a given step between the points.
Args:
start (float, str): Start of interval, defaults to the first observation.
end (float, str): End of interval, defaults to the last observation.
n (int): Number of points to generate in the interval.
step (float, str): Spacing between points in the interval.
            If neither `step` nor `n` is passed, the default number of points is 100.
Examples:
>>> data = mogptk.LoadFunction(lambda x: np.sin(3*x[:,0]), 0, 10, n=200, var=0.1, name='Sine wave')
>>> data.set_prediction_range(3, 8, 200)
>>> data = mogptk.LoadCSV('gold.csv', 'Date', 'Price')
            >>> data.set_prediction_range('2016-01-15', '2016-06-15', step='1D')
"""
# TODO: accept multiple input dims and make sure dtype equals X
if self.get_input_dims() != 1:
raise ValueError("can only set prediction range on one dimensional input data")
if start is None:
start = [x[0] for x in self.X]
        if end is None:
            end = [x[-1] for x in self.X]
start = self._normalize_val(start)
end = self._normalize_val(end)
# TODO: works for multi input dims?
if end <= start:
raise ValueError("start must be lower than end")
# TODO: prediction range for multi input dimension; fix other axes to zero so we can plot?
X_pred = [np.array([])] * self.get_input_dims()
if step is None and n is not None:
for i in range(self.get_input_dims()):
X_pred[i] = start[i] + (end[i]-start[i])*np.linspace(0.0, 1.0, n)
else:
if self.get_input_dims() != 1:
raise ValueError("cannot use step for multi dimensional input, use n")
if step is None:
step = (end[0]-start[0])/100
else:
step = _parse_delta(step)
X_pred[0] = np.arange(start[0], end[0]+step, step)
self.X_pred = [Serie(x, self.X[i].transformers) for i, x in enumerate(X_pred)]
# clear old prediction data now that X_pred has been updated
self.clear_predictions()
def clear_predictions(self):
"""
Clear all saved predictions.
"""
self.Y_mu_pred = {}
self.Y_var_pred = {}
################################################################
def get_nyquist_estimation(self):
"""
Estimate the Nyquist frequency by taking 0.5/(minimum distance of points).
Returns:
numpy.ndarray: Nyquist frequency array of shape (input_dims,).
Examples:
>>> freqs = data.get_nyquist_estimation()
"""
input_dims = self.get_input_dims()
nyquist = np.empty((input_dims))
for i in range(self.get_input_dims()):
x = np.sort(self.X[i].transformed[self.mask])
dist = np.abs(x[1:]-x[:-1])
dist = np.min(dist[np.nonzero(dist)])
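            # the smallest non-zero spacing bounds the highest frequency that can be resolved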
nyquist[i] = 0.5/dist
return nyquist
def get_lombscargle_estimation(self, Q=1, n=10000):
"""
Peak estimation of the spectrum using Lomb-Scargle.
Args:
Q (int): Number of peaks to find.
n (int): Number of points to use for Lomb-Scargle.
Returns:
numpy.ndarray: Amplitude array of shape (Q,input_dims).
numpy.ndarray: Frequency array of shape (Q,input_dims).
numpy.ndarray: Variance array of shape (Q,input_dims).
Examples:
>>> amplitudes, means, variances = data.get_lombscargle_estimation()
"""
input_dims = self.get_input_dims()
        # Gaussian: f(x) = A * exp(-(x-B)^2 / (2C^2))
# i.e. A is the amplitude or peak height, B the mean or peak position, and C the std.dev. or peak width
A = np.zeros((Q, input_dims))
B = np.zeros((Q, input_dims))
C = np.zeros((Q, input_dims))
nyquist = self.get_nyquist_estimation()
for i in range(input_dims):
x, y = np.array([x.transformed[self.mask] for x in self.X]).T, self.Y.transformed[self.mask]
freq = np.linspace(0, nyquist[i], n+1)[1:]
psd = signal.lombscargle(x[:,i]*2.0*np.pi, y, freq)
ind, _ = signal.find_peaks(psd)
ind = ind[np.argsort(psd[ind])[::-1]] # sort by biggest peak first
widths, width_heights, _, _ = signal.peak_widths(psd, ind, rel_height=0.5)
widths *= freq[1]-freq[0]
positions = freq[ind]
amplitudes = psd[ind]
# from full-width half-maximum to Gaussian sigma
# note that amplitudes / width_heights is near 2 when the base of the peak is near zero
stddevs = widths / np.sqrt(8 * np.log(amplitudes / width_heights))
if Q < len(amplitudes):
amplitudes = amplitudes[:Q]
positions = positions[:Q]
stddevs = stddevs[:Q]
n = len(amplitudes)
A[:n,i] = np.sqrt(amplitudes)
B[:n,i] = positions
C[:n,i] = stddevs
return A, B, C
def get_bnse_estimation(self, Q=1, n=1000):
"""
Peak estimation of the spectrum using BNSE (Bayesian Non-parametric Spectral Estimation).
Args:
Q (int): Number of peaks to find.
n (int): Number of points of the grid to evaluate frequencies.
Returns:
numpy.ndarray: Amplitude array of shape (Q,input_dims).
numpy.ndarray: Frequency array of shape (Q,input_dims).
numpy.ndarray: Variance array of shape (Q,input_dims).
Examples:
>>> amplitudes, means, variances = data.get_bnse_estimation()
"""
input_dims = self.get_input_dims()
        # Gaussian: f(x) = A * exp(-(x-B)^2 / (2C^2))
        # i.e. A is the amplitude or peak height, B the mean or peak position, and C the variance or peak width
A = np.zeros((Q, input_dims))
B = np.zeros((Q, input_dims))
C = np.zeros((Q, input_dims))
nyquist = self.get_nyquist_estimation()
for i in range(input_dims):
x, y = np.array([x.transformed[self.mask] for x in self.X]).T, self.Y.transformed[self.mask]
bnse = bse(x[:,i], y)
bnse.set_freqspace(nyquist[i], dimension=n)
bnse.train()
bnse.compute_moments()
amplitudes, positions, variances = bnse.get_freq_peaks()
if len(positions) == 0:
continue
if Q < len(amplitudes):
amplitudes = amplitudes[:Q]
positions = positions[:Q]
variances = variances[:Q]
num = len(amplitudes)
            # division by 100 makes it similar to other estimators (empirically found)
A[:num,i] = np.sqrt(amplitudes) / 100
B[:num,i] = positions
C[:num,i] = variances
return A, B, C
def get_sm_estimation(self, Q=1, method='BNSE', optimizer='Adam', iters=100, params={}, plot=False):
"""
Peak estimation of the spectrum using the spectral mixture kernel.
Args:
Q (int): Number of peaks to find.
method (str): Method of estimating SM kernels.
optimizer (str): Optimization method for SM kernels.
iters (str): Maximum iteration for SM kernels.
params (object): Additional parameters for the PyTorch optimizer.
plot (bool): Show the PSD of the kernel after fitting.
Returns:
numpy.ndarray: Amplitude array of shape (Q,input_dims).
numpy.ndarray: Frequency array of shape (Q,input_dims).
numpy.ndarray: Variance array of shape (Q,input_dims).
Examples:
>>> amplitudes, means, variances = data.get_sm_estimation()
"""
        from .sm import SM
        from .plot import plot_spectrum # used below when plot=True; assumed to be importable from .plot
input_dims = self.get_input_dims()
        # Gaussian: f(x) = A * exp(-(x-B)^2 / (2C^2))
        # i.e. A is the amplitude or peak height, B the mean or peak position, and C the variance or peak width
A = np.zeros((Q, input_dims))
B = np.zeros((Q, input_dims))
C = np.zeros((Q, input_dims))
sm = SM(self, Q)
sm.init_parameters(method)
sm.train(method=optimizer, iters=iters, **params)
if plot:
nyquist = self.get_nyquist_estimation()
means = np.array([sm.model.kernel[0][q].mean() for q in range(Q)])
weights = np.array([sm.model.kernel[0][q].weight() for q in range(Q)])
scales = np.array([sm.model.kernel[0][q].variance() for q in range(Q)])
plot_spectrum(means, scales, weights=weights, nyquist=nyquist, title=self.name)
for q in range(Q):
A[q,:] = sm.kernel[0][q].weight.numpy() # TODO: weight is not per input_dims
B[q,:] = sm.kernel[0][q].mean.numpy()
C[q,:] = sm.kernel[0][q].variance.numpy()
return A, B, C
def plot(self, pred=None, title=None, ax=None, legend=True, transformed=False):
"""
Plot the data including removed observations, latent function, and predictions.
Args:
pred (str): Specify model name to draw.
title (str): Set the title of the plot.
ax (matplotlib.axes.Axes): Draw to this axes, otherwise draw to the current axes.
legend (boolean): Display legend.
transformed (boolean): Display transformed Y data as used for training.
Returns:
matplotlib.figure.Figure: only if `ax` is not set
matplotlib.axes.Axes: only if `ax` is not set
Examples:
>>> ax = data.plot()
"""
# TODO: ability to plot conditional or marginal distribution to reduce input dims
if self.get_input_dims() > 2:
raise ValueError("cannot plot more than two input dimensions")
if self.get_input_dims() == 2:
raise NotImplementedError("two dimensional input data not yet implemented") # TODO
fig = None
if ax is None:
fig, ax = plt.subplots(1, 1, figsize=(12, 3.0), squeeze=True, constrained_layout=True)
legends = []
colors = list(matplotlib.colors.TABLEAU_COLORS)
for i, name in enumerate(self.Y_mu_pred):
if self.Y_mu_pred[name].size != 0 and (pred is None or name.lower() == pred.lower()):
X_pred, mu, lower, upper = self.get_prediction(name, transformed=transformed)
idx = np.argsort(X_pred[:,0])
ax.plot(X_pred[:,0][idx], mu[idx], ls='-', color=colors[i], lw=2)
ax.fill_between(X_pred[:,0][idx], lower[idx], upper[idx], color=colors[i], alpha=0.1)
#ax.plot(X_pred[:,0][idx], lower[idx], ls='-', color=colors[i], lw=1, alpha=0.5)
#ax.plot(X_pred[:,0][idx], upper[idx], ls='-', color=colors[i], lw=1, alpha=0.5)
label = 'Prediction'
if name is not None:
label += ' ' + name
legends.append(plt.Line2D([0], [0], ls='-', color=colors[i], lw=2, label=label))
if self.F is not None:
xmin = min(np.min(self.X[0]), np.min(self.X_pred[0]))
xmax = max(np.max(self.X[0]), np.max(self.X_pred[0]))
if np.issubdtype(self.X[0].dtype, np.datetime64):
dt = np.timedelta64(1,self.X[0].get_time_unit())
n = int((xmax-xmin) / dt) + 1
x = np.empty((n, 1), dtype=self.X[0].dtype)
x[:,0] = np.arange(xmin, xmax+np.timedelta64(1,'us'), dt)
else:
n = len(self.X[0])*10
x = np.empty((n, 1))
x[:,0] = np.linspace(xmin, xmax, n)
y = self.F(x)
if transformed:
y = self.Y.transform(y, x)
ax.plot(x[:,0], y, 'r--', lw=1)
legends.append(plt.Line2D([0], [0], ls='--', color='r', label='True'))
_, Y = self.get_data(transformed=transformed)
idx = np.argsort(self.X[0])
ax.plot(self.X[0][idx], Y[idx], 'k--', alpha=0.8)
legends.append(plt.Line2D([0], [0], ls='--', color='k', label='All Points'))
x, y = self.get_train_data(transformed=transformed)
if 1000 < x.shape[0]:
ax.plot(x[:,0], y, 'k-')
legends.append(plt.Line2D([0], [0], ls='-', color='k', label='Training Points'))
else:
ax.plot(x[:,0], y, 'k.', mew=1, ms=13, markeredgecolor='white')
legends.append(plt.Line2D([0], [0], ls='', color='k', marker='.', ms=10, label='Training Points'))
if self.has_test_data():
for removed_range in self.removed_ranges[0]:
x0 = removed_range[0]
x1 = removed_range[1]
y0 = ax.get_ylim()[0]
y1 = ax.get_ylim()[1]
ax.add_patch(patches.Rectangle(
(x0, y0), x1-x0, y1-y0, fill=True, color='xkcd:strawberry', alpha=0.2, lw=0,
))
legends.append(patches.Rectangle(
(1, 1), 1, 1, fill=True, color='xkcd:strawberry', alpha=0.5, lw=0, label='Removed Ranges'
))
xmin = min(self.X[0].min(), self.X_pred[0].min())
xmax = max(self.X[0].max(), self.X_pred[0].max())
ax.set_xlim(xmin - (xmax - xmin)*0.001, xmax + (xmax - xmin)*0.001)
ax.set_xlabel(self.X_labels[0])
ax.set_ylabel(self.Y_label)
ax.set_title(self.name if title is None else title, fontsize=14)
if legend:
            legend_rows = (len(legends)-1)//5 + 1
ax.legend(handles=legends, loc="upper center", bbox_to_anchor=(0.5,(3.0+0.5+0.3*legend_rows)/3.0), ncol=5)
if fig is not None:
return fig, ax
def plot_spectrum(self, title=None, method='ls', ax=None, per=None, maxfreq=None, transformed=False):
"""
Plot the spectrum of the data.
Args:
title (str): Set the title of the plot.
method (str): Set the method to get the spectrum such as LS or BNSE.
ax (matplotlib.axes.Axes): Draw to this axes, otherwise draw to the current axes.
            per (str, float, numpy.timedelta64): Set the scale of the X axis depending on the formatter used, e.g. per=5, per='day', or per='3D'.
maxfreq (float): Maximum frequency to plot, otherwise the Nyquist frequency is used.
transformed (boolean): Display transformed Y data as used for training.
Returns:
matplotlib.axes.Axes
Examples:
>>> ax = data.plot_spectrum(method='bnse')
"""
# TODO: ability to plot conditional or marginal distribution to reduce input dims
if self.get_input_dims() > 2:
raise ValueError("cannot plot more than two input dimensions")
if self.get_input_dims() == 2:
raise NotImplementedError("two dimensional input data not yet implemented") # TODO
if ax is None:
_, ax = plt.subplots(1, 1, figsize=(12, 3.0), squeeze=True, constrained_layout=True)
X_scale = 1.0
if np.issubdtype(self.X[0].dtype, np.datetime64):
if per is None:
per = _datetime64_unit_names[self.X[0].get_time_unit()]
else:
unit = _parse_delta(per)
X_scale = np.timedelta64(1,self.X[0].get_time_unit()) / unit
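                # rescale X so the frequency axis is expressed per the requested period rather than per base time unit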
if not isinstance(per, str):
per = '%s' % (unit,)
if per is not None:
ax.set_xlabel('Frequency [1/'+per+']')
else:
ax.set_xlabel('Frequency')
        X = self.X[0].astype(np.float64)
Y = self.Y
if transformed:
Y = self.Y.transformed
idx = np.argsort(X)
X = X[idx] * X_scale
Y = Y[idx]
nyquist = maxfreq
if nyquist is None:
dist = np.abs(X[1:]-X[:-1])
nyquist = 0.5 / np.average(dist)
X_freq = np.linspace(0.0, nyquist, 10001)[1:]
Y_freq_err = []
if method.lower() == 'ls':
Y_freq = signal.lombscargle(X*2.0*np.pi, Y, X_freq)
elif method.lower() == 'bnse':
bnse = bse(X, Y)
bnse.set_freqspace(nyquist, 10001)
bnse.train()
bnse.compute_moments()
Y_freq = bnse.post_mean_r**2 + bnse.post_mean_i**2
Y_freq_err = 2 * np.sqrt(np.diag(bnse.post_cov_r**2 + bnse.post_cov_i**2))
Y_freq = Y_freq[1:]
Y_freq_err = Y_freq_err[1:]
else:
raise ValueError('periodogram method "%s" does not exist' % (method))
ax.plot(X_freq, Y_freq, '-', c='k', lw=2)
if len(Y_freq_err) != 0:
ax.fill_between(X_freq, Y_freq-Y_freq_err, Y_freq+Y_freq_err, alpha=0.4)
ax.set_title((self.name + ' Spectrum' if self.name is not None else '') if title is None else title, fontsize=14)
xmin = X_freq.min()
xmax = X_freq.max()
ax.set_xlim(xmin - (xmax - xmin)*0.005, xmax + (xmax - xmin)*0.005)
ax.set_yticks([])
ax.set_ylim(0, None)
return ax
def _normalize_val(self, val):
# normalize input values for X axis, that is: expand to input_dims if a single value, convert values to appropriate dtype
if val is None:
return val
if isinstance(val, np.ndarray):
if val.ndim == 0:
val = [val.item()]
else:
val = list(val)
elif not isinstance(val, list):
val = [val] * self.get_input_dims()
if len(val) != self.get_input_dims():
raise ValueError("value must be a scalar or a list of values for each input dimension")
if not _is_homogeneous_type(val):
raise ValueError("value must have elements of the same type")
for i in range(self.get_input_dims()):
try:
val[i] = self.X[i].dtype.type(val[i])
except:
raise ValueError("value must be of type %s" % (self.X[i].dtype,))
return val
def _is_homogeneous_type(seq):
it = iter(seq)
first = type(next(it))
return all(type(x) is first for x in it)
def _check_function(f, input_dims, is_datetime64):
    if not inspect.isfunction(f):
        raise ValueError("f must be a function taking X as a parameter")
sig = inspect.signature(f)
if not len(sig.parameters) == 1:
raise ValueError("function must take X as a parameter")
if is_datetime64:
x = np.array([[np.datetime64('2000', 'us')] * input_dims])
else:
x = np.ones((1, input_dims))
y = f(x)
    if y.ndim != 1 or y.shape[0] != 1:
        raise ValueError("function must return Y with shape (n,), note that X has shape (n,input_dims)")
_datetime64_unit_names = {
'Y': 'year',
'M': 'month',
'W': 'week',
'D': 'day',
'h': 'hour',
'm': 'minute',
's': 'second',
'ms': 'millisecond',
'us': 'microsecond',
}
duration_regex = re.compile(
    r'^((?P<years>[\.\d]+?)Y)?'
    r'((?P<months>[\.\d]+?)M)?'
    r'((?P<weeks>[\.\d]+?)W)?'
    r'((?P<days>[\.\d]+?)D)?'
    r'((?P<hours>[\.\d]+?)h)?'
    r'((?P<minutes>[\.\d]+?)m)?'
    r'((?P<seconds>[\.\d]+?)s)?'
    r'((?P<milliseconds>[\.\d]+?)ms)?'
    r'((?P<microseconds>[\.\d]+?)us)?$'
)
def _parse_delta(text):
if not isinstance(text, str):
return text
if text == 'year' or text == 'years':
return np.timedelta64(1, 'Y')
elif text == 'month' or text == 'months':
return np.timedelta64(1, 'M')
elif text == 'week' or text == 'weeks':
return np.timedelta64(1, 'W')
elif text == 'day' or text == 'days':
return np.timedelta64(1, 'D')
elif text == 'hour' or text == 'hours':
return np.timedelta64(1, 'h')
elif text == 'minute' or text == 'minutes':
return np.timedelta64(1, 'm')
elif text == 'second' or text == 'seconds':
return np.timedelta64(1, 's')
elif text == 'millisecond' or text == 'milliseconds':
return np.timedelta64(1, 'ms')
elif text == 'microsecond' or text == 'microseconds':
return np.timedelta64(1, 'us')
m = duration_regex.match(text)
if m is None:
raise ValueError('duration string must be of the form 2h45m, allowed characters: (Y)ear, (M)onth, (W)eek, (D)ay, (h)our, (m)inute, (s)econd, (ms) for milliseconds, (us) for microseconds')
delta = 0
matches = m.groupdict()
if matches['years']:
delta += np.timedelta64(np.int32(matches['years']), 'Y')
if matches['months']:
delta += np.timedelta64(np.int32(matches['months']), 'M')
if matches['weeks']:
delta += np.timedelta64(np.int32(matches['weeks']), 'W')
if matches['days']:
delta += np.timedelta64(np.int32(matches['days']), 'D')
if matches['hours']:
delta += np.timedelta64(np.int32(matches['hours']), 'h')
if matches['minutes']:
delta += np.timedelta64(np.int32(matches['minutes']), 'm')
if matches['seconds']:
delta += np.timedelta64(np.int32(matches['seconds']), 's')
if matches['milliseconds']:
delta += np.timedelta64(np.int32(matches['milliseconds']), 'ms')
if matches['microseconds']:
delta += np.timedelta64(np.int32(matches['microseconds']), 'us')
return delta
def _datetime64_to_higher_unit(array):
if array.dtype in ['<M8[Y]', '<M8[M]', '<M8[W]', '<M8[D]']:
return array
units = ['D', 'h', 'm', 's'] # cannot convert days to non-linear months or years
for unit in units:
frac, _ = np.modf((array-np.datetime64('2000')) / np.timedelta64(1,unit))
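        # when every timestamp is a whole number of this unit (relative to 2000), the coarser dtype is lossless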
if not np.any(frac):
return array.astype('datetime64[%s]' % (unit,))
return array
def _timedelta64_to_higher_unit(array):
if array.dtype in ['<m8[Y]', '<m8[M]', '<m8[W]', '<m8[D]']:
return array
units = ['D', 'h', 'm', 's'] # cannot convert days to non-linear months or years
for unit in units:
_, intg = np.modf(array / np.timedelta64(1,unit))
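        # pick the largest unit in which the delta amounts to at least one whole unit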
if np.any(intg):
return array.astype('timedelta64[%s]' % (unit,))
return array
""" self.Y_mu_pred = {} self.Y_var_pred = {} ################################################################ def get_nyquist_estimation(self): """ Estimate the Nyquist frequency by taking 0.5/(minimum distance of points). Returns: numpy.ndarray: Nyquist frequency array of shape (input_dims,). Examples: >>> freqs = data.get_nyquist_estimation() """ input_dims = self.get_input_dims() nyquist = np.empty((input_dims)) for i in range(self.get_input_dims()): x = np.sort(self.X[i].transformed[self.mask]) dist = np.abs(x[1:]-x[:-1]) dist = np.min(dist[np.nonzero(dist)]) nyquist[i] = 0.5/dist return nyquist def get_lombscargle_estimation(self, Q=1, n=10000): """ Peak estimation of the spectrum using Lomb-Scargle. Args: Q (int): Number of peaks to find. n (int): Number of points to use for Lomb-Scargle. Returns: numpy.ndarray: Amplitude array of shape (Q,input_dims). numpy.ndarray: Frequency array of shape (Q,input_dims). numpy.ndarray: Variance array of shape (Q,input_dims). Examples: >>> amplitudes, means, variances = data.get_lombscargle_estimation() """ input_dims = self.get_input_dims() # Gaussian: f(x) = A * exp((x-B)^2 / (2C^2)) # i.e. A is the amplitude or peak height, B the mean or peak position, and C the std.dev. or peak width A = np.zeros((Q, input_dims)) B = np.zeros((Q, input_dims)) C = np.zeros((Q, input_dims)) nyquist = self.get_nyquist_estimation() for i in range(input_dims): x, y = np.array([x.transformed[self.mask] for x in self.X]).T, self.Y.transformed[self.mask] freq = np.linspace(0, nyquist[i], n+1)[1:] psd = signal.lombscargle(x[:,i]*2.0*np.pi, y, freq) ind, _ = signal.find_peaks(psd) ind = ind[np.argsort(psd[ind])[::-1]] # sort by biggest peak first widths, width_heights, _, _ = signal.peak_widths(psd, ind, rel_height=0.5) widths *= freq[1]-freq[0] positions = freq[ind] amplitudes = psd[ind] # from full-width half-maximum to Gaussian sigma # note that amplitudes / width_heights is near 2 when the base of the peak is near zero stddevs = widths / np.sqrt(8 * np.log(amplitudes / width_heights)) if Q < len(amplitudes): amplitudes = amplitudes[:Q] positions = positions[:Q] stddevs = stddevs[:Q] n = len(amplitudes) A[:n,i] = np.sqrt(amplitudes) B[:n,i] = positions C[:n,i] = stddevs return A, B, C def get_bnse_estimation(self, Q=1, n=1000): """ Peak estimation of the spectrum using BNSE (Bayesian Non-parametric Spectral Estimation). Args: Q (int): Number of peaks to find. n (int): Number of points of the grid to evaluate frequencies. Returns: numpy.ndarray: Amplitude array of shape (Q,input_dims). numpy.ndarray: Frequency array of shape (Q,input_dims). numpy.ndarray: Variance array of shape (Q,input_dims). Examples: >>> amplitudes, means, variances = data.get_bnse_estimation() """ input_dims = self.get_input_dims() # Gaussian: f(x) = A * exp((x-B)^2 / (2C^2)) # Ie. 
A is the amplitude or peak height, B the mean or peak position, and C the variance or peak width A = np.zeros((Q, input_dims)) B = np.zeros((Q, input_dims)) C = np.zeros((Q, input_dims)) nyquist = self.get_nyquist_estimation() for i in range(input_dims): x, y = np.array([x.transformed[self.mask] for x in self.X]).T, self.Y.transformed[self.mask] bnse = bse(x[:,i], y) bnse.set_freqspace(nyquist[i], dimension=n) bnse.train() bnse.compute_moments() amplitudes, positions, variances = bnse.get_freq_peaks() if len(positions) == 0: continue if Q < len(amplitudes): amplitudes = amplitudes[:Q] positions = positions[:Q] variances = variances[:Q] num = len(amplitudes) # division by 100 makes it similar to other estimators (emperically found) A[:num,i] = np.sqrt(amplitudes) / 100 B[:num,i] = positions C[:num,i] = variances return A, B, C def get_sm_estimation(self, Q=1, method='BNSE', optimizer='Adam', iters=100, params={}, plot=False): """ Peak estimation of the spectrum using the spectral mixture kernel. Args: Q (int): Number of peaks to find. method (str): Method of estimating SM kernels. optimizer (str): Optimization method for SM kernels. iters (str): Maximum iteration for SM kernels. params (object): Additional parameters for the PyTorch optimizer. plot (bool): Show the PSD of the kernel after fitting. Returns: numpy.ndarray: Amplitude array of shape (Q,input_dims). numpy.ndarray: Frequency array of shape (Q,input_dims). numpy.ndarray: Variance array of shape (Q,input_dims). Examples: >>> amplitudes, means, variances = data.get_sm_estimation() """ from .sm import SM input_dims = self.get_input_dims() # Gaussian: f(x) = A * exp((x-B)^2 / (2C^2)) # Ie. A is the amplitude or peak height, B the mean or peak position, and C the variance or peak width A = np.zeros((Q, input_dims)) B = np.zeros((Q, input_dims)) C = np.zeros((Q, input_dims)) sm = SM(self, Q) sm.init_parameters(method) sm.train(method=optimizer, iters=iters, **params) if plot: nyquist = self.get_nyquist_estimation() means = np.array([sm.model.kernel[0][q].mean() for q in range(Q)]) weights = np.array([sm.model.kernel[0][q].weight() for q in range(Q)]) scales = np.array([sm.model.kernel[0][q].variance() for q in range(Q)]) plot_spectrum(means, scales, weights=weights, nyquist=nyquist, title=self.name) for q in range(Q): A[q,:] = sm.kernel[0][q].weight.numpy() # TODO: weight is not per input_dims B[q,:] = sm.kernel[0][q].mean.numpy() C[q,:] = sm.kernel[0][q].variance.numpy() return A, B, C def plot(self, pred=None, title=None, ax=None, legend=True, transformed=False): """ Plot the data including removed observations, latent function, and predictions. Args: pred (str): Specify model name to draw. title (str): Set the title of the plot. ax (matplotlib.axes.Axes): Draw to this axes, otherwise draw to the current axes. legend (boolean): Display legend. transformed (boolean): Display transformed Y data as used for training. 
Returns: matplotlib.figure.Figure: only if `ax` is not set matplotlib.axes.Axes: only if `ax` is not set Examples: >>> ax = data.plot() """ # TODO: ability to plot conditional or marginal distribution to reduce input dims if self.get_input_dims() > 2: raise ValueError("cannot plot more than two input dimensions") if self.get_input_dims() == 2: raise NotImplementedError("two dimensional input data not yet implemented") # TODO fig = None if ax is None: fig, ax = plt.subplots(1, 1, figsize=(12, 3.0), squeeze=True, constrained_layout=True) legends = [] colors = list(matplotlib.colors.TABLEAU_COLORS) for i, name in enumerate(self.Y_mu_pred): if self.Y_mu_pred[name].size != 0 and (pred is None or name.lower() == pred.lower()): X_pred, mu, lower, upper = self.get_prediction(name, transformed=transformed) idx = np.argsort(X_pred[:,0]) ax.plot(X_pred[:,0][idx], mu[idx], ls='-', color=colors[i], lw=2) ax.fill_between(X_pred[:,0][idx], lower[idx], upper[idx], color=colors[i], alpha=0.1) #ax.plot(X_pred[:,0][idx], lower[idx], ls='-', color=colors[i], lw=1, alpha=0.5) #ax.plot(X_pred[:,0][idx], upper[idx], ls='-', color=colors[i], lw=1, alpha=0.5) label = 'Prediction' if name is not None: label += ' ' + name legends.append(plt.Line2D([0], [0], ls='-', color=colors[i], lw=2, label=label)) if self.F is not None: xmin = min(np.min(self.X[0]), np.min(self.X_pred[0])) xmax = max(np.max(self.X[0]), np.max(self.X_pred[0])) if np.issubdtype(self.X[0].dtype, np.datetime64): dt = np.timedelta64(1,self.X[0].get_time_unit()) n = int((xmax-xmin) / dt) + 1 x = np.empty((n, 1), dtype=self.X[0].dtype) x[:,0] = np.arange(xmin, xmax+np.timedelta64(1,'us'), dt) else: n = len(self.X[0])*10 x = np.empty((n, 1)) x[:,0] = np.linspace(xmin, xmax, n) y = self.F(x) if transformed: y = self.Y.transform(y, x) ax.plot(x[:,0], y, 'r--', lw=1) legends.append(plt.Line2D([0], [0], ls='--', color='r', label='True')) _, Y = self.get_data(transformed=transformed) idx = np.argsort(self.X[0]) ax.plot(self.X[0][idx], Y[idx], 'k--', alpha=0.8) legends.append(plt.Line2D([0], [0], ls='--', color='k', label='All Points')) x, y = self.get_train_data(transformed=transformed) if 1000 < x.shape[0]: ax.plot(x[:,0], y, 'k-') legends.append(plt.Line2D([0], [0], ls='-', color='k', label='Training Points')) else: ax.plot(x[:,0], y, 'k.', mew=1, ms=13, markeredgecolor='white') legends.append(plt.Line2D([0], [0], ls='', color='k', marker='.', ms=10, label='Training Points')) if self.has_test_data(): for removed_range in self.removed_ranges[0]: x0 = removed_range[0] x1 = removed_range[1] y0 = ax.get_ylim()[0] y1 = ax.get_ylim()[1] ax.add_patch(patches.Rectangle( (x0, y0), x1-x0, y1-y0, fill=True, color='xkcd:strawberry', alpha=0.2, lw=0, )) legends.append(patches.Rectangle( (1, 1), 1, 1, fill=True, color='xkcd:strawberry', alpha=0.5, lw=0, label='Removed Ranges' )) xmin = min(self.X[0].min(), self.X_pred[0].min()) xmax = max(self.X[0].max(), self.X_pred[0].max()) ax.set_xlim(xmin - (xmax - xmin)*0.001, xmax + (xmax - xmin)*0.001) ax.set_xlabel(self.X_labels[0]) ax.set_ylabel(self.Y_label) ax.set_title(self.name if title is None else title, fontsize=14) if legend: legend_rows = (len(legends)-1)/5 + 1 ax.legend(handles=legends, loc="upper center", bbox_to_anchor=(0.5,(3.0+0.5+0.3*legend_rows)/3.0), ncol=5) if fig is not None: return fig, ax def plot_spectrum(self, title=None, method='ls', ax=None, per=None, maxfreq=None, transformed=False): """ Plot the spectrum of the data. Args: title (str): Set the title of the plot. 
method (str): Set the method to get the spectrum such as LS or BNSE. ax (matplotlib.axes.Axes): Draw to this axes, otherwise draw to the current axes. per (str, float, numpy.timedelta64): Set the scale of the X axis depending on the formatter used, eg. per=5, per='day', or per='3D'. maxfreq (float): Maximum frequency to plot, otherwise the Nyquist frequency is used. transformed (boolean): Display transformed Y data as used for training. Returns: matplotlib.axes.Axes Examples: >>> ax = data.plot_spectrum(method='bnse') """ # TODO: ability to plot conditional or marginal distribution to reduce input dims if self.get_input_dims() > 2: raise ValueError("cannot plot more than two input dimensions") if self.get_input_dims() == 2: raise NotImplementedError("two dimensional input data not yet implemented") # TODO if ax is None: _, ax = plt.subplots(1, 1, figsize=(12, 3.0), squeeze=True, constrained_layout=True) X_scale = 1.0 if np.issubdtype(self.X[0].dtype, np.datetime64): if per is None: per = _datetime64_unit_names[self.X[0].get_time_unit()] else: unit = _parse_delta(per) X_scale = np.timedelta64(1,self.X[0].get_time_unit()) / unit if not isinstance(per, str): per = '%s' % (unit,) if per is not None: ax.set_xlabel('Frequency [1/'+per+']') else: ax.set_xlabel('Frequency') X = self.X[0].astype(np.float) Y = self.Y if transformed: Y = self.Y.transformed idx = np.argsort(X) X = X[idx] * X_scale Y = Y[idx] nyquist = maxfreq if nyquist is None: dist = np.abs(X[1:]-X[:-1]) nyquist = 0.5 / np.average(dist) X_freq = np.linspace(0.0, nyquist, 10001)[1:] Y_freq_err = [] if method.lower() == 'ls': Y_freq = signal.lombscargle(X*2.0*np.pi, Y, X_freq) elif method.lower() == 'bnse': bnse = bse(X, Y) bnse.set_freqspace(nyquist, 10001) bnse.train() bnse.compute_moments() Y_freq = bnse.post_mean_r**2 + bnse.post_mean_i**2 Y_freq_err = 2 * np.sqrt(np.diag(bnse.post_cov_r**2 + bnse.post_cov_i**2)) Y_freq = Y_freq[1:] Y_freq_err = Y_freq_err[1:] else: raise ValueError('periodogram method "%s" does not exist' % (method)) ax.plot(X_freq, Y_freq, '-', c='k', lw=2) if len(Y_freq_err) != 0: ax.fill_between(X_freq, Y_freq-Y_freq_err, Y_freq+Y_freq_err, alpha=0.4) ax.set_title((self.name + ' Spectrum' if self.name is not None else '') if title is None else title, fontsize=14) xmin = X_freq.min() xmax = X_freq.max() ax.set_xlim(xmin - (xmax - xmin)*0.005, xmax + (xmax - xmin)*0.005) ax.set_yticks([]) ax.set_ylim(0, None) return ax def _normalize_val(self, val): # normalize input values for X axis, that is: expand to input_dims if a single value, convert values to appropriate dtype if val is None: return val if isinstance(val, np.ndarray): if val.ndim == 0: val = [val.item()] else: val = list(val) elif not isinstance(val, list): val = [val] * self.get_input_dims() if len(val) != self.get_input_dims(): raise ValueError("value must be a scalar or a list of values for each input dimension") if not _is_homogeneous_type(val): raise ValueError("value must have elements of the same type") for i in range(self.get_input_dims()): try: val[i] = self.X[i].dtype.type(val[i]) except: raise ValueError("value must be of type %s" % (self.X[i].dtype,)) return val
Methods
def aggregate(self, duration, f=np.mean)
-
Aggregate the data by duration and apply a function to obtain a reduced dataset. For example, group daily data by week and take the mean. The duration can be set as a number which defines the intervals on the X axis, or by a string written in the duration format in case the X axis has data type numpy.datetime64. The duration format uses: Y=year, M=month, W=week, D=day, h=hour, m=minute, and s=second. For example, 3W1D means three weeks and one day, i.e. 22 days, or 6M to mean six months.
Args
duration (float, str): Duration along the X axis or as a string in the duration format.
f (function): Function to use to reduce data.
Examples
>>> data.aggregate(5)
>>> data.aggregate('2W', f=np.sum)
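A minimal usage sketch (the CSV file and column names are hypothetical, following the LoadCSV examples elsewhere in this module): downsampling a daily series to weekly sums. Note that aggregate() modifies the channel in place and resets the mask, so previously removed observations become training points again.
>>> data = mogptk.LoadCSV('gold.csv', 'Date', 'Price')  # hypothetical file
>>> data.aggregate('1W', f=np.sum)  # replace the daily observations by weekly sums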
def clear_predictions(self)
-
Clear all saved predictions.
def copy(self)
-
Make a deep copy of Data.
Returns
Data
Examples
>>> other = data.copy()
def filter(self, start, end)
-
Filter the data range to be between start and end in the X axis.
Args
start (float, str): Start of interval.
end (float, str): End of interval.
Examples
>>> data.filter(3, 8)
>>> data.filter('2016-01-15', '2016-06-15')
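Note that the implementation keeps points with start <= x < end, so the interval is half-open and the end value itself is excluded:
>>> data.filter('2016-01-01', '2017-01-01')  # keeps all of 2016; '2017-01-01' itself is dropped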
def get_bnse_estimation(self, Q=1, n=1000)
-
Peak estimation of the spectrum using BNSE (Bayesian Non-parametric Spectral Estimation).
Args
Q (int): Number of peaks to find.
n (int): Number of points of the grid to evaluate frequencies.
Returns
numpy.ndarray: Amplitude array of shape (Q,input_dims).
numpy.ndarray: Frequency array of shape (Q,input_dims).
numpy.ndarray: Variance array of shape (Q,input_dims).
Examples
>>> amplitudes, means, variances = data.get_bnse_estimation()
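A short sketch of inspecting the estimates; the variable names are illustrative. Rows beyond the number of detected peaks remain zero since the arrays are zero-initialized:
>>> A, B, C = data.get_bnse_estimation(Q=3)
>>> B[:,0]  # the (up to) three peak frequencies for input dimension 0
>>> A[:,0]  # the corresponding amplitudes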
def get_data(self, transformed=False)
-
Returns all observations, train and test.
Arguments
transformed (boolean): Return transformed data.
Returns
numpy.ndarray: X data of shape (n,input_dims).
numpy.ndarray: Y data of shape (n,).
Examples
>>> x, y = data.get_data()
def get_input_dims(self)
-
Returns the number of input dimensions.
Returns
int: Number of input dimensions.
Examples
>>> data.get_input_dims()
2
def get_lombscargle_estimation(self, Q=1, n=10000)
-
Peak estimation of the spectrum using Lomb-Scargle.
Args
Q (int): Number of peaks to find.
n (int): Number of points to use for Lomb-Scargle.
Returns
numpy.ndarray: Amplitude array of shape (Q,input_dims).
numpy.ndarray: Frequency array of shape (Q,input_dims).
numpy.ndarray: Variance array of shape (Q,input_dims).
Examples
>>> amplitudes, means, variances = data.get_lombscargle_estimation()
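The returned arrays parameterize Gaussians fitted to the periodogram peaks, f(x) = A * exp(-(x-B)^2 / (2C^2)): A is the peak amplitude, B the peak frequency, and C the peak width (standard deviation). For example:
>>> A, B, C = data.get_lombscargle_estimation(Q=2)
>>> B[0,0], C[0,0]  # frequency and width of the strongest peak in dimension 0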
def get_name(self)
-
Return the name of the channel.
Returns
str
Examples
>>> data.get_name()
'A'
def get_nyquist_estimation(self)
-
Estimate the Nyquist frequency by taking 0.5/(minimum distance of points).
Returns
numpy.ndarray: Nyquist frequency array of shape (input_dims,).
Examples
>>> freqs = data.get_nyquist_estimation()
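The estimate is computed per input dimension from the smallest non-zero gap between (transformed) training points. A common use is bounding the frequency axis of a spectrum plot:
>>> nyq = data.get_nyquist_estimation()
>>> data.plot_spectrum(maxfreq=nyq[0])  # plot up to the estimated Nyquist frequency of dimension 0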
def get_prediction(self, name, sigma=2.0, transformed=False)
-
Returns the prediction of a given name with a confidence interval of sigma times the standard deviation.
Args
name (str): Name of the model of the prediction.
sigma (float): The confidence interval's number of standard deviations.
transformed (boolean): Return transformed data as used for training.
Returns
numpy.ndarray: X prediction of shape (n,input_dims).
numpy.ndarray: Y mean prediction of shape (n,).
numpy.ndarray: Y lower prediction of uncertainty interval of shape (n,).
numpy.ndarray: Y upper prediction of uncertainty interval of shape (n,).
Examples
>>> x, y_mean, y_var_lower, y_var_upper = data.get_prediction('MOSM', sigma=1)
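A hedged sketch of retrieving and drawing a stored prediction; it assumes a model was trained and its predictions were saved under the name 'MOSM':
>>> x, mu, lower, upper = data.get_prediction('MOSM', sigma=2.0)
>>> plt.plot(x[:,0], mu)
>>> plt.fill_between(x[:,0], lower, upper, alpha=0.3)  # roughly a 95% interval for sigma=2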
def get_prediction_names(self)
-
Returns the model names of the saved predictions.
Returns
list: List of prediction names.
Examples
>>> data.get_prediction_names()
['MOSM', 'CSM', 'SM-LMC', 'CONV']
def get_prediction_x(self)
-
Returns the prediction X range.
Returns
numpy.ndarray: X prediction of shape (n,input_dims).
Examples
>>> x = data.get_prediction_x()
def get_sm_estimation(self, Q=1, method='BNSE', optimizer='Adam', iters=100, params={}, plot=False)
-
Peak estimation of the spectrum using the spectral mixture kernel.
Args
Q (int): Number of peaks to find.
method (str): Method of estimating SM kernels.
optimizer (str): Optimization method for SM kernels.
iters (int): Maximum number of iterations for SM kernels.
params (object): Additional parameters for the PyTorch optimizer.
plot (bool): Show the PSD of the kernel after fitting.
Returns
numpy.ndarray: Amplitude array of shape (Q,input_dims).
numpy.ndarray: Frequency array of shape (Q,input_dims).
numpy.ndarray: Variance array of shape (Q,input_dims).
Examples
>>> amplitudes, means, variances = data.get_sm_estimation()
def get_test_data(self, transformed=False)
-
Returns the observations used for testing which correspond to the removed points.
Arguments
transformed (boolean): Return transformed data.
Returns
numpy.ndarray: X data of shape (n,input_dims).
numpy.ndarray: Y data of shape (n,).
Examples
>>> x, y = data.get_test_data()
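Together with the remove_* methods this gives a simple train/test split, as in this sketch:
>>> data.remove_randomly(pct=0.2)           # hold out 20% of the observations
>>> x_train, y_train = data.get_train_data()
>>> x_test, y_test = data.get_test_data()   # the held-out points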
def get_train_data(self, transformed=False)
-
Returns the observations used for training.
Arguments
transformed (boolean): Return transformed data.
Returns
numpy.ndarray: X data of shape (n,input_dims).
numpy.ndarray: Y data of shape (n,).
Examples
>>> x, y = data.get_train_data()
def has_test_data(self)
-
Returns True if observations have been removed using the remove_* methods.
Returns
boolean
Examples
>>> data.has_test_data()
True
def plot(self, pred=None, title=None, ax=None, legend=True, transformed=False)
-
Plot the data including removed observations, latent function, and predictions.
Args
pred (str): Specify model name to draw.
title (str): Set the title of the plot.
ax (matplotlib.axes.Axes): Draw to this axes, otherwise draw to the current axes.
legend (boolean): Display legend.
transformed (boolean): Display transformed Y data as used for training.
Returns
matplotlib.figure.Figure: only if ax is not set.
matplotlib.axes.Axes: only if ax is not set.
Examples
>>> ax = data.plot()
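When no axes are passed, the figure and axes are returned and can be saved as usual (the file name is illustrative):
>>> fig, ax = data.plot(transformed=True)
>>> fig.savefig('channel.png', dpi=150)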
def plot_spectrum(self, title=None, method='ls', ax=None, per=None, maxfreq=None, transformed=False)
-
Plot the spectrum of the data.
Args
title (str): Set the title of the plot.
method (str): Set the method to get the spectrum such as LS or BNSE.
ax (matplotlib.axes.Axes): Draw to this axes, otherwise draw to the current axes.
per (str, float, numpy.timedelta64): Set the scale of the X axis depending on the formatter used, e.g. per=5, per='day', or per='3D'.
maxfreq (float): Maximum frequency to plot, otherwise the Nyquist frequency is used.
transformed (boolean): Display transformed Y data as used for training.
Returns
matplotlib.axes.Axes
Examples
>>> ax = data.plot_spectrum(method='bnse')
def remove_index(self, index)
-
Removes observations at the given indices.
Args
index (list, numpy.ndarray): Array of indices of the data to remove.
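For example:
>>> data.remove_index([0, 5, 9])  # mark the observations at indices 0, 5, and 9 as test points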
def remove_random_ranges(self, n, duration)
-
Removes a number of ranges to simulate sensor failure. May remove fewer ranges if there is no more room to remove a range in the remaining data.
Args
n (int): Number of ranges to remove.
duration (float, str): Width of ranges to remove, can use a number or the duration format syntax (see aggregate()).
Examples
>>> data.remove_random_ranges(2, 5)     # remove two ranges that are 5 wide in input space
>>> data.remove_random_ranges(3, '1d')  # remove three ranges that are 1 day wide
def remove_randomly(self, n=None, pct=None)
-
Removes observations randomly on the whole range. Either n observations are removed, or a percentage of the observations.
Args
n (int): Number of observations to remove randomly.
pct (float): Percentage in interval [0,1] of observations to remove randomly.
Examples
>>> data.remove_randomly(50)       # remove 50 observations
>>> data.remove_randomly(pct=0.9)  # remove 90% of the observations
def remove_range(self, start=None, end=None)
-
Removes observations in the interval [start,end].
Args
start (float, str): Start of interval. Defaults to the first value in observations.
end (float, str): End of interval. Defaults to the last value in observations.
Examples
>>> data = mogptk.LoadFunction(lambda x: np.sin(3*x[:,0]), 0, 10, n=200, var=0.1, name='Sine wave')
>>> data.remove_range(3, 8)
>>> data = mogptk.LoadCSV('gold.csv', 'Date', 'Price')
>>> data.remove_range('2016-01-15', '2016-06-15')
def remove_relative_range(self, start=0.0, end=1.0)
-
Removes observations between start and end as a percentage of the number of observations. So 0 is the first observation, 0.5 is the middle observation, and 1 is the last observation.
Args
start (float): Start percentage in interval [0,1].
end (float): End percentage in interval [0,1].
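Note that, per the implementation, the percentages are taken over the X range between the smallest and largest observation rather than over the observation count. For example:
>>> data.remove_relative_range(0.4, 0.6)  # remove the middle 20% of the X range
>>> data.remove_relative_range(end=0.1)   # remove the first 10%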
def rescale_x(self, upper=1000.0)
-
Rescale the X axis so that it is in the interval [0.0,upper]. This helps training most kernels.
Args
upper (float): Upper end of the interval.
Examples
>>> data.rescale_x()
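For example, to map the transformed X axis onto [0,100] instead of the default [0,1000]:
>>> data.rescale_x(100.0)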
def reset(self)
-
Reset the data set and undo the removal of data points. That is, this reverts any calls to remove_randomly, remove_range, remove_relative_range, remove_random_ranges, and remove_index.
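For example:
>>> data.remove_randomly(pct=0.5)
>>> data.reset()
>>> data.has_test_data()
False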
def set_function(self, f)
-
Set the (latent) function for the data, i.e. the theoretical or true signal. This is used for plotting purposes and is optional.
The function should take one argument X of shape (n,input_dims) and return Y of shape (n,). If your data has only one input dimension, you can use X[:,0] to select the first (and only) input dimension.
Args
f (function): Function taking X with shape (n,input_dims) and returning shape (n,) as Y.
Examples
>>> data.set_function(lambda x: np.sin(3*x[:,0]))
def set_labels(self, x_labels, y_label)
-
Set axis labels for plots.
Args
x_labels (str, list of str): X data names for each input dimension.
y_label (str): Y data name for output dimension.
Examples
>>> data.set_labels(['X', 'Y'], 'Cd')
def set_name(self, name)
-
Set name for data channel.
Args
name (str): Name of data.
Examples
>>> data.set_name('Channel A')
def set_prediction_range(self, start=None, end=None, n=None, step=None)
-
Sets the prediction range. The interval is set as [start,end], with either n points or a given step between the points.
Args
start (float, str): Start of interval, defaults to the first observation.
end (float, str): End of interval, defaults to the last observation.
n (int): Number of points to generate in the interval.
step (float, str): Spacing between points in the interval. If neither step nor n is passed, the default number of points is 100.
Examples
>>> data = mogptk.LoadFunction(lambda x: np.sin(3*x[:,0]), 0, 10, n=200, var=0.1, name='Sine wave')
>>> data.set_prediction_range(3, 8, 200)
>>> data = mogptk.LoadCSV('gold.csv', 'Date', 'Price')
>>> data.set_prediction_range('2016-01-15', '2016-06-15', step='1d')
def set_prediction_x(self, X)
-
Set the prediction range directly for saved predictions. This will clear old predictions.
Args
X (list, numpy.ndarray): Array of shape (n,) or (n,input_dims) used for predictions.
Examples
>>> data.set_prediction_x([5.0, 5.5, 6.0, 6.5, 7.0])
def transform(self, transformer)
-
Transform the Y axis data by using one of the provided transformers, such as TransformDetrend, TransformLinear, TransformLog, TransformNormalize, TransformStandard, etc.
Args
transformer (obj): Transformer object derived from TransformBase.
Examples
>>> data.transform(mogptk.TransformDetrend(degree=2))        # remove polynomial trend
>>> data.transform(mogptk.TransformLinear(slope=1, bias=2))  # remove linear trend
>>> data.transform(mogptk.TransformLog)                      # log transform the data
>>> data.transform(mogptk.TransformNormalize)                # transform to [-1,1]
>>> data.transform(mogptk.TransformStandard)                 # transform to mean=0, var=1
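Each call applies another transformer on top of the existing ones, so transformations compose in the order they are applied. A common pattern (a sketch, assuming strictly positive data for the log):
>>> data.transform(mogptk.TransformLog)       # first compress the scale
>>> data.transform(mogptk.TransformStandard)  # then standardize the log values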