Module mogptk.data
Expand source code Browse git
import csv
import copy
import inspect
import numpy as np
from .bnse import *
from scipy import signal
import dateutil, datetime
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import pandas as pd
import re
import logging
logger = logging.getLogger('mogptk')
class FormatBase:
    """
    FormatBase is the interface that all formatters implement. Every method raises NotImplementedError until it is overridden by a subclass.
    """
    def parse(self, val):
        """Convert a raw input value to its internal numeric representation."""
        raise NotImplementedError()

    def parse_delta(self, val):
        """Convert a raw duration to the internal numeric unit."""
        raise NotImplementedError()

    def format(self, val):
        """Convert an internal numeric value back to a printable string."""
        raise NotImplementedError()

    def get_scale(self, maxfreq=None):
        """Return the scale and its name for the requested frequency unit."""
        raise NotImplementedError()
class FormatNumber(FormatBase):
    """
    FormatNumber is the default formatter and takes regular floating point values as input.
    """
    def __init__(self):
        self.category = 'num'

    def parse(self, val):
        """
        Convert a value to a float.

        Args:
            val: Number, or anything `float()` accepts such as a numeric string.

        Returns:
            float

        Raises:
            ValueError: If the value cannot be converted or is NaN.
        """
        # Convert first: calling np.isnan on a string or None raises TypeError,
        # while callers that drop unparsable values only catch ValueError.
        try:
            number = float(val)
        except TypeError as err:
            raise ValueError("could not convert value to a number") from err
        if np.isnan(number):
            raise ValueError("number cannot be NaN")
        return number

    def parse_delta(self, val):
        """Parse a duration; for plain numbers this is the same as parse()."""
        return self.parse(val)

    def format(self, val):
        """Return the value as a string with up to six significant digits."""
        return '%.6g' % (val,)

    def get_scale(self, maxfreq=None):
        """Return the scale and its name; plain numbers have scale 1 and no name."""
        return 1, None
class FormatDate(FormatBase):
    """
    FormatDate is a formatter that takes date values as input, such as '2019-03-01', and stores values internally as days since 1970-01-01.
    """
    def __init__(self):
        self.category = 'date'

    def parse(self, val):
        """
        Convert a date to the number of days since 1970-01-01.

        Args:
            val (str, numpy.datetime64): Date to convert; strings are parsed by dateutil.

        Returns:
            float: Days since 1970-01-01.
        """
        if isinstance(val, np.datetime64):
            dt = pd.Timestamp(val).to_pydatetime()
        else:
            dt = dateutil.parser.parse(val)
        return (dt - datetime.datetime(1970,1,1)).total_seconds()/3600/24

    def parse_delta(self, val):
        """
        Convert a duration to days.

        Args:
            val (int, float, str): Number of days, or a string in the duration format.

        Returns:
            Number of days.

        Raises:
            ValueError: If the input is neither a number nor a string.
        """
        # floats are accepted too, durations are documented as (float, str)
        if isinstance(val, (int, float, np.integer, np.floating)):
            return val
        if isinstance(val, str):
            return _parse_duration_to_sec(val)/24/3600
        raise ValueError("could not convert input to duration")

    def format(self, val):
        """Return the date for a number of days since 1970-01-01 as 'YYYY-MM-DD'."""
        # epoch + timedelta avoids the deprecated utcfromtimestamp and also
        # works for dates before 1970 on every platform
        epoch = datetime.datetime(1970,1,1)
        return (epoch + datetime.timedelta(days=float(val))).strftime('%Y-%m-%d')

    def get_scale(self, maxfreq=None):
        """
        Return the length of the given unit in days together with the unit name.

        Args:
            maxfreq (str, optional): One of 'year', 'month', 'day' (default), 'hour', 'minute' or 'second'.

        Returns:
            (scale, name), or None implicitly when maxfreq is not a known unit.
        """
        if maxfreq == 'year':
            return 365.2425, 'year'  # mean Gregorian year in days
        if maxfreq == 'month':
            return 30.4369, 'month'
        if maxfreq == None or maxfreq == 'day':
            return 1, 'day'
        if maxfreq == 'hour':
            return 1/24, 'hour'
        if maxfreq == 'minute':
            return 1/24/60, 'minute'
        if maxfreq == 'second':
            return 1/24/3600, 'second'
class FormatDateTime(FormatBase):
    """
    FormatDateTime is a formatter that takes date and time values as input, such as '2019-03-01 12:30', and stores values internally as seconds since 1970-01-01.
    """
    def __init__(self):
        self.category = 'date'

    def parse(self, val):
        """
        Convert a date and time to the number of seconds since 1970-01-01.

        Args:
            val (str, numpy.datetime64): Date and time to convert; strings are parsed by dateutil.

        Returns:
            float: Seconds since 1970-01-01.
        """
        if isinstance(val, np.datetime64):
            dt = pd.Timestamp(val).to_pydatetime()
        else:
            dt = dateutil.parser.parse(val)
        return (dt - datetime.datetime(1970,1,1)).total_seconds()

    def parse_delta(self, val):
        """
        Convert a duration to seconds.

        Args:
            val (int, float, str): Number of seconds, or a string in the duration format.

        Returns:
            Number of seconds.

        Raises:
            ValueError: If the input is neither a number nor a string.
        """
        # floats are accepted too, durations are documented as (float, str)
        if isinstance(val, (int, float, np.integer, np.floating)):
            return val
        if isinstance(val, str):
            return _parse_duration_to_sec(val)
        raise ValueError("could not convert input to duration")

    def format(self, val):
        """Return the date and time for a number of seconds since 1970-01-01 as 'YYYY-MM-DD HH:MM'."""
        # epoch + timedelta avoids the deprecated utcfromtimestamp and also
        # works for dates before 1970 on every platform
        epoch = datetime.datetime(1970,1,1)
        return (epoch + datetime.timedelta(seconds=float(val))).strftime('%Y-%m-%d %H:%M')

    def get_scale(self, maxfreq=None):
        """
        Return the length of the given unit in seconds together with the unit name.

        Args:
            maxfreq (str, optional): One of 'year', 'month', 'day', 'hour', 'minute' or 'second' (default).

        Returns:
            (scale, name), or None implicitly when maxfreq is not a known unit.
        """
        if maxfreq == 'year':
            return 3600*24*365.2425, 'year'  # mean Gregorian year in seconds
        if maxfreq == 'month':
            return 3600*24*30.4369, 'month'
        if maxfreq == 'day':
            return 3600*24, 'day'
        if maxfreq == 'hour':
            return 3600, 'hour'
        if maxfreq == 'minute':
            return 60, 'minute'
        if maxfreq == None or maxfreq == 'second':
            return 1, 'second'
################################################################
################################################################
################################################################
class TransformBase:
    """
    TransformBase is the interface for transformers of the Y axis. Subclasses override forward and backward, and optionally set_data to fit parameters.
    """
    def set_data(self, data):
        """Fit the transformer to the data; does nothing by default."""
        return None

    def forward(self, y, x=None):
        """Apply the transformation to y."""
        raise NotImplementedError()

    def backward(self, y, x=None):
        """Undo the transformation on y."""
        raise NotImplementedError()
class TransformDetrend(TransformBase):
    """
    TransformDetrend is a transformer that detrends the data. It uses NumPy `polyfit` to find an `n` degree polynomial that removes the trend.

    Args:
        degree (int): Polynomial degree that will be fit, i.e. `2` will find a quadratic trend and remove it from the data.
    """
    def __init__(self, degree=1):
        self.degree = degree

    def set_data(self, data):
        """
        Fit the trend polynomial on the observations that have not been removed.

        Raises:
            ValueError: If the data does not have exactly one input dimension.
        """
        if data.get_input_dims() != 1:
            # ValueError is still an Exception, so existing handlers keep working
            raise ValueError("can only detrend on one dimensional input data")
        self.coef = np.polyfit(data.X[data.mask,0], data.Y[data.mask], self.degree)

    def forward(self, y, x=None):
        """Subtract the fitted trend evaluated at the first column of x."""
        return y - np.polyval(self.coef, x[:, 0])

    def backward(self, y, x=None):
        """Add the fitted trend evaluated at the first column of x back to y."""
        return y + np.polyval(self.coef, x[:, 0])
class TransformLinear(TransformBase):
    """
    TransformLinear applies a fixed linear map to the data, y => (y-offset)/scale.

    Args:
        scale (float): Value the shifted data is divided by.
        offset (float): Value subtracted from the data.
    """
    def __init__(self, scale=1.0, offset=0.0):
        self.scale, self.offset = scale, offset

    def set_data(self, data):
        """Nothing to fit, scale and offset are given at construction."""
        return None

    def forward(self, y, x=None):
        """Shift by the offset, then divide by the scale."""
        shifted = y - self.offset
        return shifted/self.scale

    def backward(self, y, x=None):
        """Multiply by the scale, then add the offset."""
        scaled = self.scale*y
        return scaled + self.offset
class TransformNormalize(TransformBase):
    """
    TransformNormalize rescales the data so that the observed Y values lie in the interval [-1,1].
    """
    def __init__(self):
        pass

    def set_data(self, data):
        """Store the minimum and maximum of the observations that have not been removed."""
        observed = data.Y[data.mask]
        self.ymin = np.amin(observed)
        self.ymax = np.amax(observed)

    def forward(self, y, x=None):
        """Map [ymin,ymax] linearly onto [-1,1]."""
        span = self.ymax - self.ymin
        return -1.0 + 2.0*(y - self.ymin)/span

    def backward(self, y, x=None):
        """Map [-1,1] linearly back onto [ymin,ymax]."""
        span = self.ymax - self.ymin
        return (y + 1.0)/2.0*span + self.ymin
class TransformLog(TransformBase):
    """
    TransformLog takes the logarithm of the data. The data is first shifted along the y-axis so that its minimum becomes 1, and the mean of the logarithm is subtracted.
    """
    def __init__(self):
        pass

    def set_data(self, data):
        """Compute the shift and the mean of the log from all Y values, removed or not."""
        lowest = data.Y.min()
        self.shift = 1 - lowest
        logged = np.log(data.Y + self.shift)
        self.mean = logged.mean()

    def forward(self, y, x=None):
        """Shift, take the log and subtract the stored mean."""
        logged = np.log(y + self.shift)
        return logged - self.mean

    def backward(self, y, x=None):
        """Add the stored mean, exponentiate and undo the shift."""
        restored = np.exp(y + self.mean)
        return restored - self.shift
class TransformWhiten(TransformBase):
    """
    TransformWhiten standardizes the data so that the observations have mean 0 and variance 1.
    """
    def __init__(self):
        pass

    def set_data(self, data):
        """Store mean and standard deviation of the observations that have not been removed."""
        observed = data.Y[data.mask]
        self.mean = observed.mean()
        self.std = observed.std()

    def forward(self, y, x=None):
        """Subtract the mean and divide by the standard deviation."""
        centered = y - self.mean
        return centered / self.std

    def backward(self, y, x=None):
        """Multiply by the standard deviation and add the mean."""
        rescaled = y * self.std
        return rescaled + self.mean
################################################################
################################################################
################################################################
def LoadFunction(f, start, end, n, var=0.0, name="", random=False):
    """
    LoadFunction loads a dataset from a given function y = f(x) + N(0,var). It will pick n data points between start and end for x, for which f is being evaluated. By default the n points are spread equally over the interval, with random=True they will be picked randomly.

    The function should take one argument x with shape (n,input_dims) and return y with shape (n). If your data has only one input dimension, you can use x[:,0] to select only the first (and only) input dimension.

    Args:
        f (function): Function taking x with shape (n,input_dims) and returning shape (n) as y.
        n (int): Number of data points to pick between start and end.
        start (float, list): Define start of interval.
        end (float, list): Define end of interval.
        var (float, optional): Variance of the Gaussian noise added to the output.
        name (str, optional): Name of data.
        random (boolean): Select points randomly between start and end (defaults to False).

    Returns:
        mogptk.data.Data

    Raises:
        ValueError: If the input is not one dimensional, start is not lower than end, or var is negative.

    Examples:
        >>> LoadFunction(lambda x: np.sin(3*x[:,0]), 0, 10, n=200, var=0.1, name='Sine wave')
        <mogptk.data.Data at ...>
    """
    # TODO: make work for multiple input dimensions, take n as a list
    start = _normalize_input_dims(start, None)
    input_dims = len(start)
    if input_dims != 1:
        raise ValueError("can only load function with one dimensional input data")
    end = _normalize_input_dims(end, input_dims)
    _check_function(f, input_dims)
    if var < 0.0:
        raise ValueError("var must be non-negative")

    x = np.empty((n, input_dims))
    for i in range(input_dims):
        if start[i] >= end[i]:
            if input_dims == 1:
                raise ValueError("start must be lower than end")
            else:
                raise ValueError("start must be lower than end for input dimension %d" % (i))
        if random:
            x[:,i] = np.random.uniform(start[i], end[i], n)
        else:
            x[:,i] = np.linspace(start[i], end[i], n)

    y = np.asarray(f(x), dtype=float)
    if y.ndim == 2 and y.shape[1] == 1:
        y = y[:,0]
    # np.random.normal takes the standard deviation, so convert the variance;
    # build a new array instead of += so the array returned by f is not mutated
    y = y + np.random.normal(0.0, np.sqrt(var), n)

    data = Data(x, y, name=name)
    data.set_function(f)
    return data
################################################################
################################################################
################################################################
class Data:
def __init__(self, X, Y, name=None, formats=None, x_labels=None, y_label=None):
    """
    Data class holds all the observations, latent functions and prediction data.

    This class takes the data raw, but you can load data also conveniently using
    LoadFunction, LoadCSV, LoadDataFrame, etc. This class allows to modify the data before being passed into the model.
    Examples are transforming data, such as detrending or taking the log, removing data range to simulate sensor failure,
    and aggregating data for given spans on X, such as aggregating daily data into
    weekly data. Additionally, we also use this class to set the range we want to predict.

    Rows for which any X or Y value cannot be parsed by its formatter are dropped (a message is logged).
    Data with a single input dimension is sorted on X.

    Args:
        X (list, numpy.ndarray, dict): Independent variable data of shape (n) or (n,input_dims). When a dict of columns is passed, x_labels selects and orders the columns.
        Y (list, numpy.ndarray): Dependent variable data of shape (n).
        name (str, optional): Name of data.
        formats (dict, optional): List or dict of formatters (such as FormatNumber (default), FormatDate,
            FormatDateTime, ...) for each input dimension. The last list element is the formatter of Y.
        x_labels (str, list of str, optional): Name or names of input dimensions.
        y_label (str, optional): Name of output dimension.

    Raises:
        ValueError: If the inputs are empty, inconsistent in length, or none of the rows can be parsed.

    Examples:
        >>> channel = mogptk.Data([0, 1, 2, 3], [4, 3, 5, 6])
    """
    # convert a dict of columns to a list of rows first, so that the number of
    # rows and input dimensions below are derived from the actual data layout
    if isinstance(X, dict):
        if x_labels is None:
            raise ValueError("X and Y must be lists or numpy arrays, if dicts are passed then x_labels and/or y_label must also be set")
        if isinstance(x_labels, str):
            x_labels = [x_labels]
        if not isinstance(x_labels, list) or not all(isinstance(label, str) for label in x_labels):
            raise ValueError("x_labels must be a string or list of strings for each input dimension")
        columns = list(X.values())
        if not all(isinstance(col, (list, np.ndarray)) for col in columns) or 1 < len({len(col) for col in columns}):
            raise ValueError("X dict should contain all lists or np.ndarrays where each has the same length")
        if not all(key in X for key in x_labels):
            raise ValueError("X dict must contain all keys listed in x_labels")
        X = list(map(list, zip(*[X[key] for key in x_labels])))

    # find out number of data rows (n) and number of input dimensions (input_dims)
    n = 0
    input_dims = 0
    x_nested_lists = False
    if isinstance(X, (list, np.ndarray)) and 0 < len(X):
        n = len(X)
        input_dims = 1
        if all(isinstance(val, (list, np.ndarray)) for val in X):
            first = len(X[0])
            if all(len(val) == first for val in X):
                x_nested_lists = True
                input_dims = first
    if n == 0:
        raise ValueError("X must contain at least one data row")

    if x_labels is not None:
        if isinstance(x_labels, str) and input_dims == 1:
            x_labels = [x_labels]
        if not isinstance(x_labels, list) or not all(isinstance(label, str) for label in x_labels):
            raise ValueError("x_labels must be a string or list of strings for each input dimension")

        # a dict of formatters is keyed by label, order it like the columns;
        # labels without an entry fall back to the default formatter
        if isinstance(formats, dict):
            formats_list = [formats.get(label, FormatNumber) for label in x_labels]
            if y_label is not None:
                formats_list.append(formats.get(y_label, FormatNumber))
            formats = formats_list

    # one formatter per input dimension plus one for Y
    if formats is None:
        formats = [FormatNumber() for _ in range(input_dims+1)]
    if not isinstance(formats, list):
        raise ValueError("formats should be list or dict for each input dimension, when a dict is passed than x_labels must also be set")
    formats = list(formats)  # work on a copy so the caller's list is not modified
    for col in range(input_dims+1):
        if len(formats) <= col:
            formats.append(FormatNumber())
        elif isinstance(formats[col], type):
            formats[col] = formats[col]()

    # check the length before filling the arrays, otherwise a short Y would
    # leave uninitialized values and a long Y would raise an IndexError
    Y_raw = Y if hasattr(Y, '__len__') else list(Y)
    if len(Y_raw) != n:
        raise ValueError("X and Y must be of the same length")

    # parse all values, rows that cannot be parsed are collected and removed
    bad_rows = set()
    X_raw = X
    X = np.empty((n,input_dims))
    for row, val in enumerate(X_raw):
        try:
            if x_nested_lists:
                for col in range(input_dims):
                    X[row,col] = formats[col].parse(val[col])
            else:
                # single input dimension given as a flat list, use the X formatter
                X[row,0] = formats[0].parse(val)
        except (ValueError, TypeError):
            bad_rows.add(row)

    Y = np.empty((n,))
    for row, val in enumerate(Y_raw):
        try:
            Y[row] = formats[-1].parse(val)
        except (ValueError, TypeError):
            bad_rows.add(row)

    if 0 < len(bad_rows):
        bad_rows = sorted(bad_rows)
        logger.info("could not parse values for %d rows, removing data points", len(bad_rows))
        if len(bad_rows) == n:
            raise ValueError("none of the data points could be parsed, are they valid numbers or is an appropriate formatter set?")
        # axis=0 removes whole rows, without it np.delete flattens X
        X = np.delete(X, bad_rows, axis=0)
        Y = np.delete(Y, bad_rows, axis=0)
        n -= len(bad_rows)

    # sort on X for single input dimensions
    if input_dims == 1:
        ind = np.argsort(X, axis=0)
        X = np.take_along_axis(X, ind, axis=0)
        Y = np.take_along_axis(Y, ind[:,0], axis=0)

    self.X = X # shape (n, input_dims)
    self.Y = Y # shape (n)
    self.mask = np.array([True] * n)
    self.F = None
    self.X_pred = X
    self.Y_mu_pred = {}
    self.Y_var_pred = {}

    self.X_labels = [''] * input_dims
    if isinstance(x_labels, list) and all(isinstance(item, str) for item in x_labels):
        self.X_labels = x_labels

    self.name = ''
    if isinstance(name, str):
        self.name = name
    elif isinstance(y_label, str):
        self.name = y_label

    self.Y_label = ''
    if isinstance(y_label, str):
        self.Y_label = y_label

    self.formatters = formats # list of formatters for all input dimensions, the last element is for the output dimension
    self.transformations = [] # transformers for Y coordinates
    self.X_offsets = np.array([0.0] * input_dims)
    self.X_scales = np.array([1.0] * input_dims)
def __str__(self):
return self.__repr__()
def __repr__(self):
data = np.concatenate((self.X, self.Y.reshape(-1,1)), axis=1)
return repr(pd.DataFrame(data, columns=(self.X_labels + [self.Y_label])))
def set_name(self, name):
"""
Set name for data.
Args:
name (str): Name of data.
Examples:
>>> data.set_name('Channel A')
"""
self.name = name
def set_labels(self, x_labels, y_label):
"""
Set axes labels for plots.
Args:
x_labels (str, list of str): X data names for each input dimension.
y_label (str): Y data name for output dimension.
Examples:
>>> data.set_labels(['X', 'Y'], 'Cd')
"""
if isinstance(x_labels, str):
x_labels = [x_labels]
elif not isinstance(x_labels, list) or not all(isinstance(item, str) for item in x_labels):
raise ValueError("x_labels must be list of strings")
if not isinstance(y_label, str):
raise ValueError("y_label must be string")
if len(x_labels) != self.get_input_dims():
raise ValueError("x_labels must have the same input dimensions as the data")
self.X_labels = x_labels
self.Y_label = y_label
def set_function(self, f):
    """
    Set a (latent) function for the data, ie. the theoretical or true signal. This is optional and used for plotting purposes.

    The function should take one argument x with shape (n,input_dims) and return y with shape (n). If your data has only one input dimension, you can use x[:,0] to select only the first (and only) input dimension.

    Args:
        f (function): Function taking x with shape (n,input_dims) and returning shape (n) as y.

    Examples:
        >>> data.set_function(lambda x: np.sin(3*x[:,0]))
    """
    input_dims = self.get_input_dims()
    _check_function(f, input_dims)
    self.F = f
def set_x_scaling(self, offsets, scales):
"""
Set offset and scaling of X axis for each input dimension.
Args:
offsets (float, list or np.ndarray of floats): X offsets per input dimension.
scales (float, list or np.ndarray of floats): X scales per input dimension.
Examples:
>>> data.set_x_scaling([['X', 'Y'], 'Cd')
"""
if isinstance(offsets, float):
offsets = [offsets]
elif isinstance(offsets, np.ndarray):
offsets = list(offsets)
if not isinstance(offsets, list) or not all(isinstance(item, float) for item in offsets) or len(offsets) != self.get_input_dims():
raise ValueError("offsets must be a float, list or np.ndarray of floats and have the same input dimensions as the data")
if isinstance(scales, float):
scales = [scales]
elif isinstance(scales, np.ndarray):
scales = list(scales)
if not isinstance(scales, list) or not all(isinstance(item, float) for item in scales) or len(scales) != self.get_input_dims():
raise ValueError("scales must be a float, list or np.ndarray of floats and have the same input dimensions as the data")
self.X = self.X_offsets + (self.X/self.X_scales)
self.X_pred = self.X_offsets + (self.X_pred/self.X_scales)
self.X_offsets = offsets
self.X_scales = scales
self.X = self.X_scales*(self.X-self.X_offsets)
self.X_pred = self.X_scales*(self.X_pred-self.X_offsets)
def copy(self):
"""
Make a deep copy of Data.
Returns:
mogptk.data.Data
Examples:
>>> other = data.copy()
"""
return copy.deepcopy(self)
def transform(self, transformer):
"""
Transform the data by using one of the provided transformers, such as TransformDetrend, TransformNormalize, TransformLog, ...
Args:
transformer (obj): Transformer object with forward(y, x) and backward(y, x) methods.
Examples:
>>> data.transform(mogptk.TransformDetrend)
"""
t = transformer
if isinstance(t, type):
t = transformer()
t.set_data(self)
self.Y = t.forward(self.Y, self.X)
if self.F != None:
f = self.F
self.F = lambda x: t.forward(f(x), x)
self.transformations.append(t)
# TODO: remove?
def filter(self, start, end):
"""
Filter the data range to be between start and end. Start and end can be strings if a proper formatter is set for the independent variable.
Args:
start (float, str): Start of interval.
end (float, str): End of interval.
Examples:
>>> data = mogptk.LoadFunction(lambda x: np.sin(3*x[:,0]), 0, 10, n=200, var=0.1, name='Sine wave')
>>> data.filter(3, 8)
>>> data = mogptk.LoadCSV('gold.csv', 'Date', 'Price', format={'Date': mogptk.FormatDate})
>>> data.filter('2016-01-15', '2016-06-15')
"""
if self.get_input_dims() != 1:
raise ValueError("can only filter on one dimensional input data")
start = self.X_scales[0] * (self.formatters[0].parse(start) - self.X_offsets[0])
end = self.X_scales[0] * (self.formatters[0].parse(end) - self.X_offsets[0])
ind = (self.X[:,0] >= start) & (self.X[:,0] < end)
self.X = np.expand_dims(self.X[ind,0], 1)
self.Y = self.Y[ind]
self.mask = self.mask[ind]
# TODO: remove?
def aggregate(self, duration, f=np.mean):
"""
Aggregate the data by duration and apply a function to obtain a reduced dataset.
For example, group daily data by week and take the mean.
The duration can be set as a number which defined the intervals on the X axis,
or by a string written in the duration format with:
y=year, M=month, w=week, d=day, h=hour, m=minute, and s=second.
For example, 3w1d means three weeks and one day, ie. 22 days, or 6M to mean six months.
If using a number, be aware that when using FormatDate your X data is denoted per day,
while with FormatDateTime it is per second.
Args:
duration (float, str): Duration along the X axis or as a string in the duration format.
f (function, optional): Function to use to reduce data, by default uses np.mean.
Examples:
>>> data.aggregate(5)
>>> data.aggregate('2w', f=np.sum)
"""
if self.get_input_dims() != 1:
raise ValueError("can only aggregate on one dimensional input data")
start = self.X[0,0]
end = self.X[-1,0]
step = self.X_scales[0] * self.formatters[0].parse_delta(duration)
X = np.arange(start+step/2, end+step/2, step)
Y = np.empty((len(X)))
for i in range(len(X)):
ind = (self.X[:,0] >= X[i]-step/2) & (self.X[:,0] < X[i]+step/2)
Y[i] = f(self.Y[ind])
self.X = np.expand_dims(X, 1)
self.Y = Y
self.mask = np.array([True] * len(self.X))
################################################################
def get_name(self):
"""
Return the name.
Returns:
str.
Examples:
>>> data.get_name()
'A'
"""
return self.name
def has_test_data(self):
"""
Returns True if observations have been removed using the remove_* methods.
Returns:
boolean
Examples:
>>> data.has_test_data()
True
"""
return False in self.mask
def get_input_dims(self):
"""
Returns the number of input dimensions.
Returns:
int: Input dimensions.
Examples:
>>> data.get_input_dims()
2
"""
return self.X.shape[1]
def get_train_data(self):
"""
Returns the observations used for training.
Returns:
numpy.ndarray: X data of shape (n,input_dims).
numpy.ndarray: Y data of shape (n).
Examples:
>>> x, y = data.get_train_data()
"""
x = self.X[self.mask,:]
y = self.Y[self.mask]
return self.X_offsets + x/self.X_scales, self._detransform(y, x)
def get_data(self):
"""
Returns all observations, train and test.
Returns:
numpy.ndarray: X data of shape (n,input_dims).
numpy.ndarray: Y data of shape (n).
Examples:
>>> x, y = data.get_data()
"""
x = self.X
y = self.Y
return self.X_offsets + x/self.X_scales, self._detransform(y, x)
def get_test_data(self):
"""
Returns the observations used for testing.
Returns:
numpy.ndarray: X data of shape (n,input_dims).
numpy.ndarray: Y data of shape (n).
Examples:
>>> x, y = data.get_test_data()
"""
x = self.X[~self.mask,:]
y = self.Y[~self.mask]
return self.X_offsets + x/self.X_scales, self._detransform(y, x)
################################################################
def remove_randomly(self, n=None, pct=None):
"""
Removes observations randomly on the whole range. Either 'n' observations are removed, or a percentage of the observations.
Args:
n (int, optional): Number of observations to remove randomly.
pct (float, optional): Percentage in interval [0,1] of observations to remove randomly.
Examples:
>>> data.remove_randomly(50) # remove 50 observations
>>> data.remove_randomly(pct=0.9) # remove 90% of the observations
"""
if n == None:
if pct == None:
n = 0
else:
n = int(pct * self.X.shape[0])
idx = np.random.choice(self.X.shape[0], n, replace=False)
self.mask[idx] = False
def remove_range(self, start=None, end=None):
"""
Removes observations in the interval [start,end]. Start and end can be strings if a proper formatter is set for the independent variable.
Args:
start (float, str, optional): Start of interval. Defaults to first value in observations.
end (float, str, optional): End of interval. Defaults to last value in observations.
Examples:
>>> data = mogptk.LoadFunction(lambda x: np.sin(3*x[:,0]), 0, 10, n=200, var=0.1, name='Sine wave')
>>> data.remove_range(3, 8)
>>> data = mogptk.LoadCSV('gold.csv', 'Date', 'Price', format={'Date': mogptk.FormatDate})
>>> data.remove_range('2016-01-15', '2016-06-15')
"""
if self.get_input_dims() != 1:
raise Exception("can only remove ranges on one dimensional input data")
if start == None:
start = np.min(self.X[:,0])
else:
start = self.X_scales[0] * (self.formatters[0].parse(start) - self.X_offsets[0])
if end == None:
end = np.max(self.X[:,0])
else:
end = self.X_scales[0] * (self.formatters[0].parse(end) - self.X_offsets[0])
idx = np.where(np.logical_and(self.X[:,0] >= start, self.X[:,0] <= end))
self.mask[idx] = False
def remove_relative_range(self, start=None, end=None):
"""
Removes observations between start and end as a percentage of the number of observations. So '0' is the first observation, '0.5' is the middle observation, and '1' is the last observation.
Args:
start (float): Start percentage in interval [0,1].
end (float): End percentage in interval [0,1].
"""
if self.get_input_dims() != 1:
raise Exception("can only remove ranges on one dimensional input data")
if end is None:
end = 1
if start is None:
start = 0
x_min = np.min(self.X[:,0])
x_max = np.max(self.X[:,0])
start = x_min + max(0.0, min(1.0, start)) * (x_max-x_min)
end = x_min + max(0.0, min(1.0, end)) * (x_max-x_min)
idx = np.where(np.logical_and(self.X[:,0] >= start, self.X[:,0] <= end))
self.mask[idx] = False
def remove_random_ranges(self, n, duration):
"""
Removes a number of ranges to simulate sensor failure.
Args:
n (int): Number of ranges to remove.
duration (float, str): Width of ranges to remove, can use a number or the duration format syntax (see aggregate()).
Examples:
>>> data.remove_random_ranges(2, 5) # remove two ranges that are 5 wide in input space
>>> data.remove_random_ranges(3, '1d') # remove three ranges that are 1 day wide
"""
if self.get_input_dims() != 1:
raise Exception("can only remove ranges on one dimensional input data")
duration = self.formatters[0].parse_delta(duration)
if n < 1:
return
m = (self.X[-1]-self.X[0]) - n*duration
if m <= 0:
raise Exception("no data left after removing ranges")
locs = self.X[:,0] <= self.X[-1,0]-duration
locs[sum(locs)] = True # make sure the last data point can be deleted
for i in range(n):
x = self.X[locs][np.random.randint(len(self.X[locs]))]
locs[(self.X[:,0] > x-duration) & (self.X[:,0] < x+duration)] = False
self.mask[(self.X[:,0] >= x) & (self.X[:,0] < x+duration)] = False
################################################################
def get_prediction(self, name, sigma=2):
"""
Returns the prediction of a given name with a normal variance of sigma.
Args:
name (str): Name of the prediction, equals the name of the model that made the prediction.
sigma (float): The uncertainty interval calculated at mean-sigma*var and mean+sigma*var. Defaults to 2,
Returns:
numpy.ndarray: X prediction of shape (n,input_dims).
numpy.ndarray: Y mean prediction of shape (n,).
numpy.ndarray: Y lower prediction of uncertainty interval of shape (n,).
numpy.ndarray: Y upper prediction of uncertainty interval of shape (n,).
Examples:
>>> x, y_mean, y_var_lower, y_var_upper = data.get_prediction('MOSM', sigma=1)
"""
if name not in self.Y_mu_pred:
raise Exception("prediction name '%s' does not exist" % (name))
mu = self.Y_mu_pred[name]
lower = mu - sigma * np.sqrt(self.Y_var_pred[name])
upper = mu + sigma * np.sqrt(self.Y_var_pred[name])
mu = self._detransform(mu, self.X_pred)
lower = self._detransform(lower, self.X_pred)
upper = self._detransform(upper, self.X_pred)
return self.X_scales*(self.X_pred-self.X_offsets), mu, lower, upper
def set_prediction_range(self, start=None, end=None, n=None, step=None):
    """
    Sets the prediction range.

    The interval is set with [start,end], with either 'n' points or a
    given 'step' between the points. Start and end can be set as strings and
    step in the duration string format if the proper formatter is set.
    Existing predictions are cleared because they no longer match the new range.

    Args:
        start (float, str, optional): Start of interval, defaults to the first observation.
        end (float, str, optional): End of interval, defaults to the last observation.
        n (int, optional): Number of points to generate in the interval.
        step (float, str, optional): Spacing between points in the interval.

        If neither 'step' or 'n' is passed, default number of points is 100.

    Raises:
        Exception: If the data does not have exactly one input dimension.
        ValueError: If start is not lower than end.

    Examples:
        >>> data = mogptk.LoadFunction(lambda x: np.sin(3*x[:,0]), 0, 10, n=200, var=0.1, name='Sine wave')
        >>> data.set_prediction_range(3, 8, 200)

        >>> data = mogptk.LoadCSV('gold.csv', 'Date', 'Price', formats={'Date': mogptk.FormatDate})
        >>> data.set_prediction_range('2016-01-15', '2016-06-15', step='1d')
    """
    if self.get_input_dims() != 1:
        raise Exception("can only set prediction range on one dimensional input data")
    input_dims = self.get_input_dims()

    def resolve(value, row):
        # return the bound in original (unscaled) units
        if value is None:
            # self.X is stored scaled, undo that because the whole range is scaled again below
            return self.X_offsets + self.X[row,:]/self.X_scales
        if isinstance(value, list):
            parsed = list(value)  # copy so the caller's list is not modified
            for i in range(input_dims):
                parsed[i] = self.formatters[i].parse(parsed[i])
            return parsed
        return self.formatters[0].parse(value)

    start = _normalize_input_dims(resolve(start, 0), input_dims)
    end = _normalize_input_dims(resolve(end, -1), input_dims)

    # TODO: works for multi input dims?
    if end <= start:
        raise ValueError("start must be lower than end")

    # TODO: prediction range for multi input dimension; fix other axes to zero so we can plot?
    if step is None and n is not None:
        X_pred = np.empty((n, input_dims))
        for i in range(input_dims):
            X_pred[:,i] = np.linspace(start[i], end[i], n)
    else:
        if input_dims != 1:
            raise ValueError("cannot use step for multi dimensional input, use n")
        if step is None:
            step = (end[0]-start[0])/100
        else:
            step = self.formatters[0].parse_delta(step)
        X_pred = np.arange(start[0], end[0]+step, step).reshape(-1, 1)

    self.X_pred = self.X_scales*(X_pred-self.X_offsets)

    # clear old prediction data now that X_pred has been updated
    self.Y_mu_pred = {}
    self.Y_var_pred = {}
def set_prediction_x(self, x):
"""
Set the prediction range directly.
Args:
x (list, numpy.ndarray): Array of shape (n) or (n,input_dims) with input values to predict at.
Examples:
>>> data.set_prediction_x([5.0, 5.5, 6.0, 6.5, 7.0])
"""
if isinstance(x, list):
x = np.array(x)
elif not isinstance(x, np.ndarray):
raise ValueError("x expected to be a list or numpy.ndarray")
x = x.astype(float)
if x.ndim == 1:
x = x.reshape(-1, 1)
if x.ndim != 2 or x.shape[1] != self.get_input_dims():
raise ValueError("x shape must be (n,input_dims)")
self.X_pred = self.X_scales*(x-self.X_offsets)
# clear old prediction data now that X_pred has been updated
self.Y_mu_pred = {}
self.Y_var_pred = {}
################################################################
def get_nyquist_estimation(self):
"""
Estimate nyquist frequency by taking 0.5/(minimum distance of points).
Returns:
numpy.ndarray: Nyquist frequency array of shape (input_dims,).
Examples:
>>> freqs = data.get_nyquist_estimation()
"""
input_dims = self.get_input_dims()
nyquist = np.empty((input_dims))
for i in range(self.get_input_dims()):
x = np.sort(self.X[self.mask,i])
dist = np.abs(x[1:]-x[:-1]) # TODO: assumes X is sorted, use average distance instead of minimal distance?
dist = np.min(dist[np.nonzero(dist)])
nyquist[i] = 0.5/dist
return nyquist
def get_bnse_estimation(self, Q=1, n=5000):
    """
    Peaks estimation using BNSE (Bayesian Non-parametric Spectral Estimation).

    Args:
        Q (int): Number of peaks to find, defaults to 1.
        n (int): Number of points of the grid to evaluate frequencies, defaults to 5000.

    Returns:
        numpy.ndarray: Amplitude array of shape (input_dims,Q).
        numpy.ndarray: Frequency array of shape (input_dims,Q) in radians.
        numpy.ndarray: Variance array of shape (input_dims,Q) in radians.

        Rows stay zero for input dimensions where no peak is found.

    Examples:
        >>> amplitudes, means, variances = data.get_bnse_estimation()
    """
    input_dims = self.get_input_dims()

    # Gaussian: f(x) = A * exp(-(x-B)^2 / (2C^2))
    # Ie. A is the amplitude or peak height, B the mean or peak position, and C the variance or peak width
    A = np.zeros((input_dims, Q))
    B = np.zeros((input_dims, Q))
    C = np.zeros((input_dims, Q))

    nyquist = self.get_nyquist_estimation()
    for i in range(input_dims):
        x = self.X[self.mask, i]
        y = self.Y[self.mask]
        bnse = bse(x, y)
        bnse.set_freqspace(nyquist[i], dimension=n)
        bnse.train()
        bnse.compute_moments()

        amplitudes, positions, variances = bnse.get_freq_peaks()
        # TODO: sqrt of amplitudes? vs LS?
        # keep the peak count in its own variable, reusing n would change
        # the grid size for the next input dimension
        num_peaks = len(positions)
        if num_peaks == 0:
            continue

        if num_peaks < Q:
            # if there not enough peaks, we will repeat them
            j = 0
            while len(positions) < Q:
                amplitudes = np.append(amplitudes, amplitudes[j])
                positions = np.append(positions, positions[j])
                variances = np.append(variances, variances[j])
                j = (j+1) % num_peaks

        A[i,:] = amplitudes[:Q]
        B[i,:] = positions[:Q]
        C[i,:] = variances[:Q]
    return A, B, C
def get_lombscargle_estimation(self, Q=1, n=50000):
    """
    Peak estimation using Lomb Scargle.

    For every input dimension the Lomb-Scargle periodogram of the training
    observations (those selected by `self.mask`) is evaluated on a grid of n
    angular frequencies up to 2*pi times the Nyquist estimate. The Q highest
    local maxima are returned. If fewer than Q peaks are found they are
    repeated cyclically; if none are found the row is left at zero.

    Args:
        Q (int): Number of peaks to find, defaults to 1.
        n (int): Number of points to use for Lomb Scargle, defaults to 50000.

    Returns:
        numpy.ndarray: Amplitude array of shape (input_dims,Q), the periodogram value at each peak.
        numpy.ndarray: Frequency array of shape (input_dims,Q), the angular grid position of each peak divided by 2*pi.
        numpy.ndarray: Variance array of shape (input_dims,Q), the Gaussian sigma derived from the half-height width of each peak, on the same scale as the frequencies.

    Examples:
        >>> amplitudes, means, variances = data.get_lombscargle_estimation()
    """
    input_dims = self.get_input_dims()

    # Gaussian: f(x) = A * exp(-(x-B)^2 / (2C^2))
    # Ie. A is the amplitude or peak height, B the mean or peak position, and C the variance or peak width
    A = np.zeros((input_dims, Q))
    B = np.zeros((input_dims, Q))
    C = np.zeros((input_dims, Q))

    nyquist = self.get_nyquist_estimation() * 2 * np.pi
    for i in range(input_dims):
        # `n` must stay the caller's grid size for every dimension, so the
        # number of peaks found below is kept in a separate variable
        x = np.linspace(0, nyquist[i], n+1)[1:]
        dx = x[1]-x[0]

        y = signal.lombscargle(self.X[self.mask,i], self.Y[self.mask], x)
        ind, _ = signal.find_peaks(y)
        if len(ind) == 0:
            # no local maximum in the periodogram: keep zeros for this dimension
            continue
        ind = ind[np.argsort(y[ind])[::-1]] # sort by biggest peak first

        widths, width_heights, _, _ = signal.peak_widths(y, ind, rel_height=0.5)
        widths *= dx / np.pi / 2.0 # from grid samples to the scale of `positions`

        positions = x[ind] / np.pi / 2.0
        amplitudes = y[ind]
        variances = widths / np.sqrt(8 * np.log(amplitudes / width_heights)) # from full-width half-maximum to Gaussian sigma

        # take the first Q peaks; if there are not enough peaks, repeat them cyclically
        pick = np.arange(Q) % len(positions)
        A[i,:] = amplitudes[pick]
        B[i,:] = positions[pick]
        C[i,:] = variances[pick]
    return A, B, C
def plot(self, ax=None, plot_legend=False):
    """
    Plot the data including removed observations, latent function, and predictions.

    All X values are drawn in their original coordinates, ie. with the X
    offsets and scales undone.

    Args:
        ax (matplotlib.axes.Axes, optional): Draw to this axes, otherwise draw to the current axes.
        plot_legend (bool, optional): Draw the legend, defaults to False.

    Returns:
        matplotlib.axes.Axes

    Raises:
        Exception: If the data has two or more input dimensions.
    """
    # TODO: ability to plot conditional or marginal distribution to reduce input dims
    if self.get_input_dims() > 2:
        raise Exception("cannot plot more than two input dimensions")
    if self.get_input_dims() == 2:
        raise Exception("two dimensional input data not yet implemented") # TODO

    if ax is None:
        ax = plt.gca()

    # undo the X scaling so that the axis shows the original coordinates
    X = self.X_offsets + (self.X/self.X_scales)
    X_pred = self.X_offsets + (self.X_pred/self.X_scales)

    legend = []
    colors = list(matplotlib.colors.TABLEAU_COLORS)
    for i, name in enumerate(self.Y_mu_pred):
        if self.Y_mu_pred[name].size != 0:
            # wrap around the palette when there are more predictions than colors
            color = colors[i % len(colors)]

            # NOTE(review): the band is mean -/+ Y_var_pred as stored, whereas
            # get_prediction() uses sigma*sqrt(Y_var_pred); confirm which is intended
            lower = self.Y_mu_pred[name] - self.Y_var_pred[name]
            upper = self.Y_mu_pred[name] + self.Y_var_pred[name]
            ax.plot(X_pred[:,0], self.Y_mu_pred[name], ls='-', color=color, lw=2)
            ax.fill_between(X_pred[:,0], lower, upper, color=color, alpha=0.1)
            ax.plot(X_pred[:,0], lower, ls='-', color=color, lw=1, alpha=0.5)
            ax.plot(X_pred[:,0], upper, ls='-', color=color, lw=1, alpha=0.5)

            label = 'Prediction'
            if 1 < len(self.Y_mu_pred):
                label += ' ' + name
            legend.append(plt.Line2D([0], [0], ls='-', color=color, lw=2, label=label))

    if self.F is not None:
        # evaluate the latent function on a grid ten times denser than the data
        n = len(X[:,0])*10
        x_min = np.min(X[:,0])
        x_max = np.max(X[:,0])
        if len(X_pred) != 0:
            x_min = min(x_min, np.min(X_pred))
            x_max = max(x_max, np.max(X_pred))

        x = np.empty((n, 1))
        x[:,0] = np.linspace(x_min, x_max, n)
        y = self.F(x)

        ax.plot(x[:,0], y, 'r--', lw=1)
        legend.append(plt.Line2D([0], [0], ls='--', color='r', label='True'))

    ax.plot(X[:,0], self.Y, 'k--', alpha=0.8)
    legend.append(plt.Line2D([0], [0], ls='--', color='k', label='All Points'))

    if self.has_test_data():
        x, y = X[self.mask,:], self.Y[self.mask]
        ax.plot(x[:,0], y, 'k.', mew=1, ms=13, markeredgecolor='white')
        legend.append(plt.Line2D([0], [0], ls='', marker='.', color='k', mew=2, ms=10, label='Training Points'))

        # read the limits once so every rectangle spans the same vertical range
        y_low, y_high = ax.get_ylim()
        last = len(X) - 1
        for idx in np.where(~self.mask)[0]:
            # each removed point is shaded from the midpoint with its previous
            # neighbour to the midpoint with its next neighbour, using the same
            # de-scaled coordinates as the rest of the plot
            prev_x = X[max(idx - 1, 0), 0]
            width_1 = (X[min(idx, last), 0] - prev_x) / 2
            width_2 = (X[min(idx + 1, last), 0] - X[max(idx, 0), 0]) / 2
            ax.add_patch(
                patches.Rectangle(
                (prev_x + width_1, y_low), # (x,y)
                width_1 + width_2, # width
                y_high - y_low, # height
                fill=True,
                color='xkcd:strawberry',
                alpha=0.25,
                lw=0,
            ))
        legend.append(patches.Rectangle(
            (1, 1),
            1,
            1,
            fill=True,
            color='xkcd:strawberry',
            alpha=0.5,
            lw=0,
            label='Removed Points'
        ))

    xmin = X.min()
    xmax = X.max()
    ax.set_xlim(xmin - (xmax - xmin)*0.001, xmax + (xmax - xmin)*0.001)

    ax.set_xlabel(self.X_labels[0])
    ax.set_ylabel(self.Y_label)
    ax.set_title(self.name)

    # tick labels are rendered by the formatter of the first input dimension
    formatter = matplotlib.ticker.FuncFormatter(lambda x,pos: self.formatters[0].format(x))
    ax.xaxis.set_major_formatter(formatter)

    if (len(legend) > 0) and plot_legend:
        ax.legend(handles=legend, loc='upper center', ncol=len(legend), bbox_to_anchor=(0.5, 1.7))

    return ax
def plot_spectrum(self, method='lombscargle', ax=None, per=None, maxfreq=None):
    """
    Plot the spectrum of the data.

    Args:
        method (str, optional): Set the method to get the spectrum, either 'lombscargle' or 'bnse'.
        ax (matplotlib.axes.Axes, optional): Draw to this axes, otherwise draw to the current axes.
        per (float, str): Set the scale of the X axis; the value is passed to get_scale() of the formatter of the first input dimension, eg. 'year' or 'month' for FormatDate.
        maxfreq (float, optional): Maximum frequency to plot, otherwise one over the average distance between consecutive X values is used.

    Returns:
        matplotlib.axes.Axes

    Raises:
        Exception: If the data has two or more input dimensions.
        ValueError: If method is not a known periodogram method.
    """
    # TODO: ability to plot conditional or marginal distribution to reduce input dims
    if self.get_input_dims() > 2:
        raise Exception("cannot plot more than two input dimensions")
    if self.get_input_dims() == 2:
        raise Exception("two dimensional input data not yet implemented") # TODO

    if ax is None:
        ax = plt.gca()

    ax.set_title(self.name, fontsize=36)

    formatter = self.formatters[0]
    factor, name = formatter.get_scale(per)
    if name is not None:
        ax.set_xlabel('Frequency (1/'+name+')')
    else:
        ax.set_xlabel('Frequency [Hz]')

    # de-scaled X values expressed in units of `factor`
    X_space = np.squeeze((self.X_offsets + (self.X/self.X_scales)) / factor)

    freq = maxfreq
    if freq is None:
        dist = np.abs(X_space[1:]-X_space[:-1])
        freq = 1/np.average(dist)

    # frequency grid without the zero frequency
    X = np.linspace(0.0, freq, 10001)[1:]
    Y_err = []
    if method == 'lombscargle':
        Y = signal.lombscargle(X_space, self.Y, X)
    elif method == 'bnse':
        # TODO: check if outcome is correct
        bnse = bse(X_space, self.Y)
        bnse.set_freqspace(freq/2.0/np.pi, 10001)
        bnse.train()
        bnse.compute_moments()

        Y = bnse.post_mean_r**2 + bnse.post_mean_i**2
        Y_err = 2 * np.sqrt(np.diag(bnse.post_cov_r**2 + bnse.post_cov_i**2))
        # drop the first grid point so the lengths match X
        Y = Y[1:]
        Y_err = Y_err[1:]
    else:
        raise ValueError('periodogram method "%s" does not exist' % (method))

    ax.plot(X, Y, '-', color='xkcd:strawberry', lw=2.3)
    if len(Y_err) != 0:
        ax.fill_between(X, Y-Y_err, Y+Y_err, alpha=0.4)
    ax.set_title(self.name + ' Spectrum')

    xmin = X.min()
    xmax = X.max()
    ax.set_xlim(xmin - (xmax - xmin)*0.005, xmax + (xmax - xmin)*0.005)
    ax.set_yticks([])
    ax.set_ylim(0, None)

    return ax
def _transform(self, y, x=None):
for t in self.transformations:
y = t.forward(y, x)
return y
def _detransform(self, y, x=None):
for t in self.transformations[::-1]:
y = t.backward(y, x)
return y
def _check_function(f, input_dims):
if not inspect.isfunction(f):
raise ValueError("function must take X as a parameter")
sig = inspect.signature(f)
if not len(sig.parameters) == 1:
raise ValueError("function must take X as a parameter")
x = np.ones((1, input_dims))
y = f(x)
if len(y.shape) != 1 or y.shape[0] != 1:
raise ValueError("function must return Y with shape (n), note that X has shape (n,input_dims)")
def _normalize_input_dims(x, input_dims):
if x == None:
return x
if isinstance(x, float):
x = [x]
elif isinstance(x, int):
x = [float(x)]
elif isinstance(x, str):
x = [x]
elif isinstance(x, np.ndarray):
x = list(x)
elif not isinstance(x, list):
raise ValueError("input should be a floating point, list or ndarray")
if input_dims != None and len(x) != input_dims:
raise ValueError("input must be a scalar for single-dimension input or a list of values for each input dimension")
return x
duration_regex = re.compile(
r'^((?P<years>[\.\d]+?)y)?'
r'((?P<months>[\.\d]+?)M)?'
r'((?P<weeks>[\.\d]+?)w)?'
r'((?P<days>[\.\d]+?)d)?'
r'((?P<hours>[\.\d]+?)h)?'
r'((?P<minutes>[\.\d]+?)m)?'
r'((?P<seconds>[\.\d]+?)s)?$')
def _parse_duration_to_sec(s):
x = duration_regex.match(s)
if x == None:
raise ValueError('duration string must be of the form 2h45m, allowed characters: (y)ear, (M)onth, (w)eek, (d)ay, (h)our, (m)inute, (s)econd')
sec = 0
matches = x.groups()[1::2]
if matches[0]:
sec += float(matches[0])*356.2425*24*3600
if matches[1]:
sec += float(matches[1])*30.4369*24*3600
if matches[2]:
sec += float(matches[2])*7*24*3600
if matches[3]:
sec += float(matches[3])*24*3600
if matches[4]:
sec += float(matches[4])*3600
if matches[5]:
sec += float(matches[5])*60
if matches[6]:
sec += float(matches[6])
return sec
Functions
def LoadFunction(f, start, end, n, var=0.0, name='', random=False)
-
LoadFunction loads a dataset from a given function y = f(x) + N(0,var). It will pick n data points between start and end for x, for which f is being evaluated. By default the n points are spread equally over the interval, with random=True they will be picked randomly.
The function should take one argument x with shape (n,input_dims) and return y with shape (n). If your data has only one input dimension, you can use x[:,0] to select only the first (and only) input dimension.
Args
f
:function
- Function taking x with shape (n,input_dims) and returning shape (n) as y.
n
:int
- Number of data points to pick between start and end.
start
:float
,list
- Define start of interval.
end
:float
,list
- Define end of interval.
var
:float
, optional- Variance added to the output.
name
:str
, optional- Name of data.
random
:boolean
- Select points randomly between start and end (defaults to False).
Returns
Data
Examples
>>> LoadFunction(lambda x: np.sin(3*x[:,0]), 0, 10, n=200, var=0.1, name='Sine wave')
<mogptk.data.Data at ...>
Expand source code Browse git
def LoadFunction(f, start, end, n, var=0.0, name="", random=False):
    """
    LoadFunction loads a dataset from a given function y = f(x) + N(0,var). It will pick n data points between start and end for x, for which f is being evaluated. By default the n points are spread equally over the interval, with random=True they will be picked randomly.

    The function should take one argument x with shape (n,input_dims) and return y with shape (n). If your data has only one input dimension, you can use x[:,0] to select only the first (and only) input dimension.

    Args:
        f (function): Function taking x with shape (n,input_dims) and returning shape (n) as y.
        n (int): Number of data points to pick between start and end.
        start (float, list): Define start of interval.
        end (float, list): Define end of interval.
        var (float, optional): Variance of the Gaussian noise added to the output.
        name (str, optional): Name of data.
        random (boolean): Select points randomly between start and end (defaults to False).

    Returns:
        mogptk.data.Data

    Raises:
        ValueError: If the input is not one dimensional, start is not lower than end, var is negative, or f is not a valid function.

    Examples:
        >>> LoadFunction(lambda x: np.sin(3*x[:,0]), 0, 10, n=200, var=0.1, name='Sine wave')
        <mogptk.data.Data at ...>
    """
    # TODO: make work for multiple input dimensions, take n as a list

    start = _normalize_input_dims(start, None)
    input_dims = len(start)
    if input_dims != 1:
        raise ValueError("can only load function with one dimensional input data")

    end = _normalize_input_dims(end, input_dims)
    if var < 0.0:
        raise ValueError("var must be non-negative")

    _check_function(f, input_dims)

    x = np.empty((n, input_dims))
    for i in range(input_dims):
        if start[i] >= end[i]:
            if input_dims == 1:
                raise ValueError("start must be lower than end")
            else:
                raise ValueError("start must be lower than end for input dimension %d" % (i))

        if random:
            x[:,i] = np.random.uniform(start[i], end[i], n)
        else:
            x[:,i] = np.linspace(start[i], end[i], n)

    y = f(x)
    if y.ndim == 2 and y.shape[1] == 1:
        y = y[:,0]

    # np.random.normal takes the standard deviation, so convert the variance.
    # The sum is written out of place: f may return a view of x (eg. x[:,0])
    # or an integer array, neither of which may be modified in place.
    y = y + np.random.normal(0.0, np.sqrt(var), n)

    data = Data(x, y, name=name)
    data.set_function(f)
    return data
Classes
class Data (X, Y, name=None, formats=None, x_labels=None, y_label=None)
-
Data class holds all the observations, latent functions and prediction data.
This class takes the raw data directly, but you can also load data conveniently using LoadFunction, LoadCSV, LoadDataFrame, etc. This class allows you to modify the data before it is passed into the model. Examples are transforming data, such as detrending or taking the log, removing data ranges to simulate sensor failure, and aggregating data for given spans on X, such as aggregating daily data into weekly data. Additionally, this class is used to set the range we want to predict.
Args
X
:list
,numpy.ndarray
,dict
- Independent variable data of shape (n) or (n,input_dims).
Y
:list
,numpy.ndarray
- Dependent variable data of shape (n).
name
:str
, optional- Name of data.
formats
:dict
, optional- List or dict of formatters (such as FormatNumber (default), FormatDate, FormatDateTime, …) for each input dimension.
x_labels
:str
,list
ofstr
, optional- Name or names of input dimensions.
y_label
:str
, optional- Name of output dimension.
Examples
>>> channel = mogptk.Data([0, 1, 2, 3], [4, 3, 5, 6])
Expand source code Browse git
class Data: def __init__(self, X, Y, name=None, formats=None, x_labels=None, y_label=None): """ Data class holds all the observations, latent functions and prediction data. This class takes the data raw, but you can load data also conveniently using LoadFunction, LoadCSV, LoadDataFrame, etc. This class allows to modify the data before being passed into the model. Examples are transforming data, such as detrending or taking the log, removing data range to simulate sensor failure, and aggregating data for given spans on X, such as aggregating daily data into weekly data. Additionally, we also use this class to set the range we want to predict. Args: X (list, numpy.ndarray, dict): Independent variable data of shape (n) or (n,input_dims). Y (list, numpy.ndarray): Dependent variable data of shape (n). name (str, optional): Name of data. formats (dict, optional): List or dict of formatters (such as FormatNumber (default), FormatDate, FormatDateTime, ...) for each input dimension. x_labels (str, list of str, optional): Name or names of input dimensions. y_label (str, optional): Name of output dimension. 
Examples: >>> channel = mogptk.Data([0, 1, 2, 3], [4, 3, 5, 6]) """ # find out number of data rows (n) and number of input dimensions (input_dims) n = 0 input_dims = 0 x_nested_lists = False if isinstance(X, (list, np.ndarray, dict)) and 0 < len(X): n = len(X) input_dims = 1 if isinstance(X, dict): it1 = iter(X.values()) it2 = iter(X.values()) else: it1 = iter(X) it2 = iter(X) if all(isinstance(val, (list, np.ndarray)) for val in it1): first = len(next(it2)) if all(len(val) == first for val in it2): x_nested_lists = True input_dims = first if n == 0: raise ValueError("X must contain at least one data row") # convert dicts to lists if x_labels != None: if isinstance(x_labels, str) and input_dims == 1: x_labels = [x_labels] if not isinstance(x_labels, list) or not all(isinstance(label, str) for label in x_labels): raise ValueError("x_labels must be a string or list of strings for each input dimension") if isinstance(X, dict): it = iter(X.values()) first = len(next(it)) if not all(isinstance(x, (list, np.ndarray)) for x in X.values()) or not all(len(x) == first for x in it): raise ValueError("X dict should contain all lists or np.ndarrays where each has the same length") if not all(key in X for key in x_labels): raise ValueError("X dict must contain all keys listed in x_labels") X = list(map(list, zip(*[X[key] for key in x_labels]))) if isinstance(formats, dict): formats_list = [] for col in x_labels: formats_list.append(formats[col]) if y_label != None: formats_list.append(formats[y_label]) formats = formats_list # format X columns if formats == None: formats = [FormatNumber()] * (input_dims+1) if not isinstance(formats, list): raise ValueError("formats should be list or dict for each input dimension, when a dict is passed than x_labels must also be set") for col in range(input_dims+1): if len(formats) <= col: formats.append(FormatNumber()) elif isinstance(formats[col], type): formats[col] = formats[col]() bad_rows = set() X_raw = X X = np.empty((n,input_dims)) for 
row, val in enumerate(X_raw): if x_nested_lists: for col in range(input_dims): try: X[row,col] = formats[col].parse(val[col]) except ValueError: bad_rows.add(row) else: try: X[row,0] = formats[col].parse(val) except ValueError: bad_rows.add(row) Y_raw = Y Y = np.empty((n,)) for row, val in enumerate(Y_raw): try: Y[row] = formats[-1].parse(val) except ValueError: bad_rows.add(row) if 0 < len(bad_rows): bad_rows = list(bad_rows) logger.info("could not parse values for %d rows, removing data points", len(bad_rows)) if len(bad_rows) == n: raise ValueError("none of the data points could be parsed, are they valid numbers or is an appropriate formatter set?") X = np.delete(X, bad_rows) Y = np.delete(Y, bad_rows) n -= len(bad_rows) # check if X and Y are correct inputs if isinstance(X, list): if all(isinstance(x, list) for x in X): m = len(X[0]) if not all(len(x) == m for x in X[1:]): raise ValueError("X list items must all be lists of the same length") if not all(all(isinstance(val, (int, float)) for val in x) for x in X): raise ValueError("X list items must all be lists of numbers") elif all(isinstance(x, np.ndarray) for x in X): m = len(X[0]) if not all(len(x) == m for x in X[1:]): raise ValueError("X list items must all be numpy.ndarrays of the same length") elif not all(isinstance(x, (int, float)) for x in X): raise ValueError("X list items must be all lists, all numpy.ndarrays, or all numbers") X = np.array(X) if isinstance(Y, list): if not all(isinstance(y, (int, float)) for y in Y): raise ValueError("Y list items must all be numbers") Y = np.array(Y) if not isinstance(X, np.ndarray) or not isinstance(Y, np.ndarray): raise ValueError("X and Y must be lists or numpy arrays, if dicts are passed then x_labels and/or y_label must also be set") if X.ndim == 1: X = X.reshape(-1, 1) if X.ndim != 2: raise ValueError("X must be either a one or two dimensional array of data") if Y.ndim != 1: raise ValueError("Y must be a one dimensional array of data") if X.shape[0] != 
Y.shape[0]: raise ValueError("X and Y must be of the same length") # sort on X for single input dimensions if input_dims == 1: ind = np.argsort(X, axis=0) X = np.take_along_axis(X, ind, axis=0) Y = np.take_along_axis(Y, ind[:,0], axis=0) self.X = X # shape (n, input_dims) self.Y = Y # shape (n) self.mask = np.array([True] * n) self.F = None self.X_pred = X self.Y_mu_pred = {} self.Y_var_pred = {} self.X_labels = [''] * input_dims if isinstance(x_labels, list) and all(isinstance(item, str) for item in x_labels): self.X_labels = x_labels self.name = '' if isinstance(name, str): self.name = name elif isinstance(y_label, str): self.name = y_label self.Y_label = '' if isinstance(y_label, str): self.Y_label = y_label self.formatters = formats # list of formatters for all input dimensions, the last element is for the output dimension self.transformations = [] # transformers for Y coordinates self.X_offsets = np.array([0.0] * input_dims) self.X_scales = np.array([1.0] * input_dims) def __str__(self): return self.__repr__() def __repr__(self): data = np.concatenate((self.X, self.Y.reshape(-1,1)), axis=1) return repr(pd.DataFrame(data, columns=(self.X_labels + [self.Y_label]))) def set_name(self, name): """ Set name for data. Args: name (str): Name of data. Examples: >>> data.set_name('Channel A') """ self.name = name def set_labels(self, x_labels, y_label): """ Set axes labels for plots. Args: x_labels (str, list of str): X data names for each input dimension. y_label (str): Y data name for output dimension. 
Examples: >>> data.set_labels(['X', 'Y'], 'Cd') """ if isinstance(x_labels, str): x_labels = [x_labels] elif not isinstance(x_labels, list) or not all(isinstance(item, str) for item in x_labels): raise ValueError("x_labels must be list of strings") if not isinstance(y_label, str): raise ValueError("y_label must be string") if len(x_labels) != self.get_input_dims(): raise ValueError("x_labels must have the same input dimensions as the data") self.X_labels = x_labels self.Y_label = y_label def set_function(self, f): """ Set a (latent) function for the data, ie. the theoretical or true signal. This is used for plotting purposes and is optional. The function should take one argument x with shape (n,input_dims) and return y with shape (n). If your data has only one input dimension, you can use x[:,0] to select only the first (and only) input dimension. Args: f (function): Function taking x with shape (n,input_dims) and returning shape (n) as y. Examples: >>> data.set_function(lambda x: np.sin(3*x[:,0]) """ _check_function(f, self.get_input_dims()) self.F = f def set_x_scaling(self, offsets, scales): """ Set offset and scaling of X axis for each input dimension. Args: offsets (float, list or np.ndarray of floats): X offsets per input dimension. scales (float, list or np.ndarray of floats): X scales per input dimension. 
Examples: >>> data.set_x_scaling([['X', 'Y'], 'Cd') """ if isinstance(offsets, float): offsets = [offsets] elif isinstance(offsets, np.ndarray): offsets = list(offsets) if not isinstance(offsets, list) or not all(isinstance(item, float) for item in offsets) or len(offsets) != self.get_input_dims(): raise ValueError("offsets must be a float, list or np.ndarray of floats and have the same input dimensions as the data") if isinstance(scales, float): scales = [scales] elif isinstance(scales, np.ndarray): scales = list(scales) if not isinstance(scales, list) or not all(isinstance(item, float) for item in scales) or len(scales) != self.get_input_dims(): raise ValueError("scales must be a float, list or np.ndarray of floats and have the same input dimensions as the data") self.X = self.X_offsets + (self.X/self.X_scales) self.X_pred = self.X_offsets + (self.X_pred/self.X_scales) self.X_offsets = offsets self.X_scales = scales self.X = self.X_scales*(self.X-self.X_offsets) self.X_pred = self.X_scales*(self.X_pred-self.X_offsets) def copy(self): """ Make a deep copy of Data. Returns: mogptk.data.Data Examples: >>> other = data.copy() """ return copy.deepcopy(self) def transform(self, transformer): """ Transform the data by using one of the provided transformers, such as TransformDetrend, TransformNormalize, TransformLog, ... Args: transformer (obj): Transformer object with forward(y, x) and backward(y, x) methods. Examples: >>> data.transform(mogptk.TransformDetrend) """ t = transformer if isinstance(t, type): t = transformer() t.set_data(self) self.Y = t.forward(self.Y, self.X) if self.F != None: f = self.F self.F = lambda x: t.forward(f(x), x) self.transformations.append(t) # TODO: remove? def filter(self, start, end): """ Filter the data range to be between start and end. Start and end can be strings if a proper formatter is set for the independent variable. Args: start (float, str): Start of interval. end (float, str): End of interval. 
Examples: >>> data = mogptk.LoadFunction(lambda x: np.sin(3*x[:,0]), 0, 10, n=200, var=0.1, name='Sine wave') >>> data.filter(3, 8) >>> data = mogptk.LoadCSV('gold.csv', 'Date', 'Price', format={'Date': mogptk.FormatDate}) >>> data.filter('2016-01-15', '2016-06-15') """ if self.get_input_dims() != 1: raise ValueError("can only filter on one dimensional input data") start = self.X_scales[0] * (self.formatters[0].parse(start) - self.X_offsets[0]) end = self.X_scales[0] * (self.formatters[0].parse(end) - self.X_offsets[0]) ind = (self.X[:,0] >= start) & (self.X[:,0] < end) self.X = np.expand_dims(self.X[ind,0], 1) self.Y = self.Y[ind] self.mask = self.mask[ind] # TODO: remove? def aggregate(self, duration, f=np.mean): """ Aggregate the data by duration and apply a function to obtain a reduced dataset. For example, group daily data by week and take the mean. The duration can be set as a number which defined the intervals on the X axis, or by a string written in the duration format with: y=year, M=month, w=week, d=day, h=hour, m=minute, and s=second. For example, 3w1d means three weeks and one day, ie. 22 days, or 6M to mean six months. If using a number, be aware that when using FormatDate your X data is denoted per day, while with FormatDateTime it is per second. Args: duration (float, str): Duration along the X axis or as a string in the duration format. f (function, optional): Function to use to reduce data, by default uses np.mean. 
Examples: >>> data.aggregate(5) >>> data.aggregate('2w', f=np.sum) """ if self.get_input_dims() != 1: raise ValueError("can only aggregate on one dimensional input data") start = self.X[0,0] end = self.X[-1,0] step = self.X_scales[0] * self.formatters[0].parse_delta(duration) X = np.arange(start+step/2, end+step/2, step) Y = np.empty((len(X))) for i in range(len(X)): ind = (self.X[:,0] >= X[i]-step/2) & (self.X[:,0] < X[i]+step/2) Y[i] = f(self.Y[ind]) self.X = np.expand_dims(X, 1) self.Y = Y self.mask = np.array([True] * len(self.X)) ################################################################ def get_name(self): """ Return the name. Returns: str. Examples: >>> data.get_name() 'A' """ return self.name def has_test_data(self): """ Returns True if observations have been removed using the remove_* methods. Returns: boolean Examples: >>> data.has_test_data() True """ return False in self.mask def get_input_dims(self): """ Returns the number of input dimensions. Returns: int: Input dimensions. Examples: >>> data.get_input_dims() 2 """ return self.X.shape[1] def get_train_data(self): """ Returns the observations used for training. Returns: numpy.ndarray: X data of shape (n,input_dims). numpy.ndarray: Y data of shape (n). Examples: >>> x, y = data.get_train_data() """ x = self.X[self.mask,:] y = self.Y[self.mask] return self.X_offsets + x/self.X_scales, self._detransform(y, x) def get_data(self): """ Returns all observations, train and test. Returns: numpy.ndarray: X data of shape (n,input_dims). numpy.ndarray: Y data of shape (n). Examples: >>> x, y = data.get_data() """ x = self.X y = self.Y return self.X_offsets + x/self.X_scales, self._detransform(y, x) def get_test_data(self): """ Returns the observations used for testing. Returns: numpy.ndarray: X data of shape (n,input_dims). numpy.ndarray: Y data of shape (n). 
Examples: >>> x, y = data.get_test_data() """ x = self.X[~self.mask,:] y = self.Y[~self.mask] return self.X_offsets + x/self.X_scales, self._detransform(y, x) ################################################################ def remove_randomly(self, n=None, pct=None): """ Removes observations randomly on the whole range. Either 'n' observations are removed, or a percentage of the observations. Args: n (int, optional): Number of observations to remove randomly. pct (float, optional): Percentage in interval [0,1] of observations to remove randomly. Examples: >>> data.remove_randomly(50) # remove 50 observations >>> data.remove_randomly(pct=0.9) # remove 90% of the observations """ if n == None: if pct == None: n = 0 else: n = int(pct * self.X.shape[0]) idx = np.random.choice(self.X.shape[0], n, replace=False) self.mask[idx] = False def remove_range(self, start=None, end=None): """ Removes observations in the interval [start,end]. Start and end can be strings if a proper formatter is set for the independent variable. Args: start (float, str, optional): Start of interval. Defaults to first value in observations. end (float, str, optional): End of interval. Defaults to last value in observations. 
Examples: >>> data = mogptk.LoadFunction(lambda x: np.sin(3*x[:,0]), 0, 10, n=200, var=0.1, name='Sine wave') >>> data.remove_range(3, 8) >>> data = mogptk.LoadCSV('gold.csv', 'Date', 'Price', format={'Date': mogptk.FormatDate}) >>> data.remove_range('2016-01-15', '2016-06-15') """ if self.get_input_dims() != 1: raise Exception("can only remove ranges on one dimensional input data") if start == None: start = np.min(self.X[:,0]) else: start = self.X_scales[0] * (self.formatters[0].parse(start) - self.X_offsets[0]) if end == None: end = np.max(self.X[:,0]) else: end = self.X_scales[0] * (self.formatters[0].parse(end) - self.X_offsets[0]) idx = np.where(np.logical_and(self.X[:,0] >= start, self.X[:,0] <= end)) self.mask[idx] = False def remove_relative_range(self, start=None, end=None): """ Removes observations between start and end as a percentage of the number of observations. So '0' is the first observation, '0.5' is the middle observation, and '1' is the last observation. Args: start (float): Start percentage in interval [0,1]. end (float): End percentage in interval [0,1]. """ if self.get_input_dims() != 1: raise Exception("can only remove ranges on one dimensional input data") if end is None: end = 1 if start is None: start = 0 x_min = np.min(self.X[:,0]) x_max = np.max(self.X[:,0]) start = x_min + max(0.0, min(1.0, start)) * (x_max-x_min) end = x_min + max(0.0, min(1.0, end)) * (x_max-x_min) idx = np.where(np.logical_and(self.X[:,0] >= start, self.X[:,0] <= end)) self.mask[idx] = False def remove_random_ranges(self, n, duration): """ Removes a number of ranges to simulate sensor failure. Args: n (int): Number of ranges to remove. duration (float, str): Width of ranges to remove, can use a number or the duration format syntax (see aggregate()). 
Examples: >>> data.remove_random_ranges(2, 5) # remove two ranges that are 5 wide in input space >>> data.remove_random_ranges(3, '1d') # remove three ranges that are 1 day wide """ if self.get_input_dims() != 1: raise Exception("can only remove ranges on one dimensional input data") duration = self.formatters[0].parse_delta(duration) if n < 1: return m = (self.X[-1]-self.X[0]) - n*duration if m <= 0: raise Exception("no data left after removing ranges") locs = self.X[:,0] <= self.X[-1,0]-duration locs[sum(locs)] = True # make sure the last data point can be deleted for i in range(n): x = self.X[locs][np.random.randint(len(self.X[locs]))] locs[(self.X[:,0] > x-duration) & (self.X[:,0] < x+duration)] = False self.mask[(self.X[:,0] >= x) & (self.X[:,0] < x+duration)] = False ################################################################ def get_prediction(self, name, sigma=2): """ Returns the prediction of a given name with a normal variance of sigma. Args: name (str): Name of the prediction, equals the name of the model that made the prediction. sigma (float): The uncertainty interval calculated at mean-sigma*var and mean+sigma*var. Defaults to 2, Returns: numpy.ndarray: X prediction of shape (n,input_dims). numpy.ndarray: Y mean prediction of shape (n,). numpy.ndarray: Y lower prediction of uncertainty interval of shape (n,). numpy.ndarray: Y upper prediction of uncertainty interval of shape (n,). 
Examples: >>> x, y_mean, y_var_lower, y_var_upper = data.get_prediction('MOSM', sigma=1) """ if name not in self.Y_mu_pred: raise Exception("prediction name '%s' does not exist" % (name)) mu = self.Y_mu_pred[name] lower = mu - sigma * np.sqrt(self.Y_var_pred[name]) upper = mu + sigma * np.sqrt(self.Y_var_pred[name]) mu = self._detransform(mu, self.X_pred) lower = self._detransform(lower, self.X_pred) upper = self._detransform(upper, self.X_pred) return self.X_scales*(self.X_pred-self.X_offsets), mu, lower, upper def set_prediction_range(self, start=None, end=None, n=None, step=None): """ Sets the prediction range. The interval is set with [start,end], with either 'n' points or a given 'step' between the points. Start and end can be set as strings and step in the duration string format if the proper formatter is set. Args: start (float, str, optional): Start of interval, defaults to the first observation. end (float, str, optional): End of interval, defaults to the last observation. n (int, optional): Number of points to generate in the interval. step (float, str, optional): Spacing between points in the interval. If neither 'step' or 'n' is passed, default number of points is 100. 
Examples: >>> data = mogptk.LoadFunction(lambda x: np.sin(3*x[:,0]), 0, 10, n=200, var=0.1, name='Sine wave') >>> data.set_prediction_range(3, 8, 200) >>> data = mogptk.LoadCSV('gold.csv', 'Date', 'Price', formats={'Date': mogptk.FormatDate}) >>> data.set_prediction_range('2016-01-15', '2016-06-15', step='1d') """ if self.get_input_dims() != 1: raise Exception("can only set prediction range on one dimensional input data") if start == None: start = self.X[0,:] elif isinstance(start, list): for i in range(self.get_input_dims()): start[i] = self.formatters[i].parse(start[i]) else: start = self.formatters[0].parse(start) if end == None: end = self.X[-1,:] elif isinstance(end, list): for i in range(self.get_input_dims()): end[i] = self.formatters[i].parse(end[i]) else: end = self.formatters[0].parse(end) start = _normalize_input_dims(start, self.get_input_dims()) end = _normalize_input_dims(end, self.get_input_dims()) # TODO: works for multi input dims? if end <= start: raise ValueError("start must be lower than end") # TODO: prediction range for multi input dimension; fix other axes to zero so we can plot? self.X_pred = np.array([]) if step == None and n != None: self.X_pred = np.empty((n, self.get_input_dims())) for i in range(self.get_input_dims()): self.X_pred[:,i] = np.linspace(start[i], end[i], n) else: if self.get_input_dims() != 1: raise ValueError("cannot use step for multi dimensional input, use n") if step == None: step = (end[0]-start[0])/100 else: step = self.formatters[0].parse_delta(step) self.X_pred = np.arange(start[0], end[0]+step, step).reshape(-1, 1) self.X_pred = self.X_scales*(self.X_pred-self.X_offsets) def set_prediction_x(self, x): """ Set the prediction range directly. Args: x (list, numpy.ndarray): Array of shape (n) or (n,input_dims) with input values to predict at. 
Examples: >>> data.set_prediction_x([5.0, 5.5, 6.0, 6.5, 7.0]) """ if isinstance(x, list): x = np.array(x) elif not isinstance(x, np.ndarray): raise ValueError("x expected to be a list or numpy.ndarray") x = x.astype(float) if x.ndim == 1: x = x.reshape(-1, 1) if x.ndim != 2 or x.shape[1] != self.get_input_dims(): raise ValueError("x shape must be (n,input_dims)") self.X_pred = self.X_scales*(x-self.X_offsets) # clear old prediction data now that X_pred has been updated self.Y_mu_pred = {} self.Y_var_pred = {} ################################################################ def get_nyquist_estimation(self): """ Estimate nyquist frequency by taking 0.5/(minimum distance of points). Returns: numpy.ndarray: Nyquist frequency array of shape (input_dims,). Examples: >>> freqs = data.get_nyquist_estimation() """ input_dims = self.get_input_dims() nyquist = np.empty((input_dims)) for i in range(self.get_input_dims()): x = np.sort(self.X[self.mask,i]) dist = np.abs(x[1:]-x[:-1]) # TODO: assumes X is sorted, use average distance instead of minimal distance? dist = np.min(dist[np.nonzero(dist)]) nyquist[i] = 0.5/dist return nyquist def get_bnse_estimation(self, Q=1, n=5000): """ Peaks estimation using BNSE (Bayesian Non-parametric Spectral Estimation). Args: Q (int): Number of peaks to find, defaults to 1. n (int): Number of points of the grid to evaluate frequencies, defaults to 5000. Returns: numpy.ndarray: Amplitude array of shape (input_dims,Q). numpy.ndarray: Frequency array of shape (input_dims,Q) in radians. numpy.ndarray: Variance array of shape (input_dims,Q) in radians. Examples: >>> amplitudes, means, variances = data.get_bnse_estimation() """ input_dims = self.get_input_dims() # Gaussian: f(x) = A * exp((x-B)^2 / (2C^2)) # Ie. 
A is the amplitude or peak height, B the mean or peak position, and C the variance or peak width A = np.zeros((input_dims, Q)) B = np.zeros((input_dims, Q)) C = np.zeros((input_dims, Q)) nyquist = self.get_nyquist_estimation() for i in range(input_dims): x = self.X[self.mask, i] y = self.Y[self.mask] bnse = bse(x, y) bnse.set_freqspace(nyquist[i], dimension=n) bnse.train() bnse.compute_moments() amplitudes, positions, variances = bnse.get_freq_peaks() # TODO: sqrt of amplitudes? vs LS? if len(positions) == 0: continue n = len(positions) if n < Q and n != 0: # if there not enough peaks, we will repeat them j = 0 while len(positions) < Q: amplitudes = np.append(amplitudes, amplitudes[j]) positions = np.append(positions, positions[j]) variances = np.append(variances, variances[j]) j = (j+1) % n A[i,:] = amplitudes[:Q] B[i,:] = positions[:Q] C[i,:] = variances[:Q] return A, B, C def get_lombscargle_estimation(self, Q=1, n=50000): """ Peak estimation using Lomb Scargle. Args: Q (int): Number of peaks to find, defaults to 1. n (int): Number of points to use for Lomb Scargle, defaults to 50000. Returns: numpy.ndarray: Amplitude array of shape (input_dims,Q). numpy.ndarray: Frequency array of shape (input_dims,Q) in radians. numpy.ndarray: Variance array of shape (input_dims,Q) in radians. Examples: >>> amplitudes, means, variances = data.get_lombscargle_estimation() """ input_dims = self.get_input_dims() # Gaussian: f(x) = A * exp((x-B)^2 / (2C^2)) # Ie. 
A is the amplitude or peak height, B the mean or peak position, and C the variance or peak width A = np.zeros((input_dims, Q)) B = np.zeros((input_dims, Q)) C = np.zeros((input_dims, Q)) nyquist = self.get_nyquist_estimation() * 2 * np.pi for i in range(input_dims): x = np.linspace(0, nyquist[i], n+1)[1:] dx = x[1]-x[0] y = signal.lombscargle(self.X[self.mask,i], self.Y[self.mask], x) ind, _ = signal.find_peaks(y) ind = ind[np.argsort(y[ind])[::-1]] # sort by biggest peak first widths, width_heights, _, _ = signal.peak_widths(y, ind, rel_height=0.5) widths *= dx / np.pi / 2.0 positions = x[ind] / np.pi / 2.0 amplitudes = y[ind] variances = widths / np.sqrt(8 * np.log(amplitudes / width_heights)) # from full-width half-maximum to Gaussian sigma n = len(positions) if n < Q and n != 0: # if there not enough peaks, we will repeat them j = 0 while len(positions) < Q: amplitudes = np.append(amplitudes, amplitudes[j]) positions = np.append(positions, positions[j]) variances = np.append(variances, variances[j]) j = (j+1) % n A[i,:] = amplitudes[:Q] B[i,:] = positions[:Q] C[i,:] = variances[:Q] return A, B, C def plot(self, ax=None, plot_legend=False): """ Plot the data including removed observations, latent function, and predictions. Args: ax (matplotlib.axes.Axes, optional): Draw to this axes, otherwise draw to the current axes. 
Returns: matplotlib.axes.Axes """ # TODO: ability to plot conditional or marginal distribution to reduce input dims if self.get_input_dims() > 2: raise Exception("cannot plot more than two input dimensions") if self.get_input_dims() == 2: raise Exception("two dimensional input data not yet implemented") # TODO if ax == None: ax = plt.gca() X = self.X_offsets + (self.X/self.X_scales) X_pred = self.X_offsets + (self.X_pred/self.X_scales) legend = [] colors = list(matplotlib.colors.TABLEAU_COLORS) for i, name in enumerate(self.Y_mu_pred): if self.Y_mu_pred[name].size != 0: lower = self.Y_mu_pred[name] - self.Y_var_pred[name] upper = self.Y_mu_pred[name] + self.Y_var_pred[name] ax.plot(X_pred[:,0], self.Y_mu_pred[name], ls='-', color=colors[i], lw=2) ax.fill_between(X_pred[:,0], lower, upper, color=colors[i], alpha=0.1) ax.plot(X_pred[:,0], lower, ls='-', color=colors[i], lw=1, alpha=0.5) ax.plot(X_pred[:,0], upper, ls='-', color=colors[i], lw=1, alpha=0.5) label = 'Prediction' if 1 < len(self.Y_mu_pred): label += ' ' + name legend.append(plt.Line2D([0], [0], ls='-', color=colors[i], lw=2, label=label)) if self.F != None: n = len(X[:,0])*10 x_min = np.min(X[:,0]) x_max = np.max(X[:,0]) if len(X_pred) != 0: x_min = min(x_min, np.min(X_pred)) x_max = max(x_max, np.max(X_pred)) x = np.empty((n, 1)) x[:,0] = np.linspace(x_min, x_max, n) y = self.F(x) ax.plot(x[:,0], y, 'r--', lw=1) legend.append(plt.Line2D([0], [0], ls='--', color='r', label='True')) ax.plot(X[:,0], self.Y, 'k--', alpha=0.8) legend.append(plt.Line2D([0], [0], ls='--', color='k', label='All Points')) if self.has_test_data(): x, y = X[self.mask,:], self.Y[self.mask] ax.plot(x[:,0], y, 'k.', mew=1, ms=13, markeredgecolor='white') legend.append(plt.Line2D([0], [0], ls='', marker='.', color='k', mew=2, ms=10, label='Training Points')) for idx in np.where(~self.mask)[0]: width_1 = (self.X[min(idx, len(X) - 1), 0] - self.X[max(idx - 1, 0), 0]) / 2 width_2 = (self.X[min(idx + 1, len(X) - 1), 0] - self.X[max(idx, 
0), 0]) / 2 ax.add_patch( patches.Rectangle( (self.X[max(idx - 1, 0), 0] + width_1, ax.get_ylim()[0]), # (x,y) width_1 + width_2, # width ax.get_ylim()[1] - ax.get_ylim()[0], # height fill=True, color='xkcd:strawberry', alpha=0.25, lw=0, )) legend.append(patches.Rectangle( (1, 1), 1, 1, fill=True, color='xkcd:strawberry', alpha=0.5, lw=0, label='Removed Points' )) xmin = X.min() xmax = X.max() ax.set_xlim(xmin - (xmax - xmin)*0.001, xmax + (xmax - xmin)*0.001) ax.set_xlabel(self.X_labels[0]) ax.set_ylabel(self.Y_label) ax.set_title(self.name) formatter = matplotlib.ticker.FuncFormatter(lambda x,pos: self.formatters[0].format(x)) ax.xaxis.set_major_formatter(formatter) if (len(legend) > 0) and plot_legend: ax.legend(handles=legend, loc='upper center', ncol=len(legend), bbox_to_anchor=(0.5, 1.7)) return ax def plot_spectrum(self, method='lombscargle', ax=None, per=None, maxfreq=None): """ Plot the spectrum of the data. Args: method (str, optional): Set the method to get the spectrum such as 'lombscargle'. ax (matplotlib.axes.Axes, optional): Draw to this axes, otherwise draw to the current axes. per (float, str): Set the scale of the X axis depending on the formatter used, eg. per=5 or per='3d' for three days. maxfreq (float, optional): Maximum frequency to plot, otherwise the Nyquist frequency is used. 
Returns: matplotlib.axes.Axes """ # TODO: ability to plot conditional or marginal distribution to reduce input dims if self.get_input_dims() > 2: raise Exception("cannot plot more than two input dimensions") if self.get_input_dims() == 2: raise Exception("two dimensional input data not yet implemented") # TODO if ax == None: ax = plt.gca() ax.set_title(self.name, fontsize=36) formatter = self.formatters[0] factor, name = formatter.get_scale(per) if name != None: ax.set_xlabel('Frequency (1/'+name+')') else: ax.set_xlabel('Frequency [Hz]') X_space = np.squeeze((self.X_offsets + (self.X/self.X_scales)) / factor) freq = maxfreq if freq == None: dist = np.abs(X_space[1:]-X_space[:-1]) freq = 1/np.average(dist) X = np.linspace(0.0, freq, 10001)[1:] Y_err = [] if method == 'lombscargle': Y = signal.lombscargle(X_space, self.Y, X) elif method == 'bnse': # TODO: check if outcome is correct nyquist = self.get_nyquist_estimation() bnse = bse(X_space, self.Y) bnse.set_freqspace(freq/2.0/np.pi, 10001) bnse.train() bnse.compute_moments() Y = bnse.post_mean_r**2 + bnse.post_mean_i**2 Y_err = 2 * np.sqrt(np.diag(bnse.post_cov_r**2 + bnse.post_cov_i**2)) Y = Y[1:] Y_err = Y_err[1:] else: raise ValueError('periodogram method "%s" does not exist' % (method)) ax.plot(X, Y, '-', color='xkcd:strawberry', lw=2.3) if len(Y_err) != 0: ax.fill_between(X, Y-Y_err, Y+Y_err, alpha=0.4) ax.set_title(self.name + ' Spectrum') xmin = X.min() xmax = X.max() ax.set_xlim(xmin - (xmax - xmin)*0.005, xmax + (xmax - xmin)*0.005) ax.set_yticks([]) ax.set_ylim(0, None) return ax def _transform(self, y, x=None): for t in self.transformations: y = t.forward(y, x) return y def _detransform(self, y, x=None): for t in self.transformations[::-1]: y = t.backward(y, x) return y
Methods
def aggregate(self, duration, f=<function mean>)
-
Aggregate the data by duration and apply a function to obtain a reduced dataset.
For example, group daily data by week and take the mean. The duration can be set as a number which defines the width of the intervals on the X axis, or as a string written in the duration format with: y=year, M=month, w=week, d=day, h=hour, m=minute, and s=second. For example, 3w1d means three weeks and one day, i.e. 22 days, and 6M means six months. If using a number, be aware that when using FormatDate your X data is denoted per day, while with FormatDateTime it is per second.
Args
duration
:float
,str
- Duration along the X axis or as a string in the duration format.
f
:function
, optional- Function to use to reduce data, by default uses np.mean.
Examples
>>> data.aggregate(5) >>> data.aggregate('2w', f=np.sum)
Expand source code Browse git
def aggregate(self, duration, f=np.mean):
    """
    Reduce the dataset by grouping observations into consecutive windows along the X axis.

    Every window is `duration` wide and is replaced by a single observation located
    at the window centre, whose Y value is `f` applied to the Y values inside the window.
    For example, group daily data by week and take the mean. The duration is either a number
    (the window width on the X axis) or a string in the duration format with: y=year, M=month,
    w=week, d=day, h=hour, m=minute, and s=second. For example, 3w1d means three weeks and one
    day, ie. 22 days, or 6M to mean six months. If using a number, be aware that when using
    FormatDate your X data is denoted per day, while with FormatDateTime it is per second.

    X, Y and the mask are replaced in place; afterwards all observations are marked as training data.

    Args:
        duration (float, str): Duration along the X axis or as a string in the duration format.
        f (function, optional): Function to use to reduce data, by default uses np.mean.

    Raises:
        ValueError: If the data does not have exactly one input dimension.

    Examples:
        >>> data.aggregate(5)

        >>> data.aggregate('2w', f=np.sum)
    """
    if self.get_input_dims() != 1:
        raise ValueError("can only aggregate on one dimensional input data")

    x_obs = self.X[:,0]
    first, last = self.X[0,0], self.X[-1,0]
    step = self.X_scales[0] * self.formatters[0].parse_delta(duration)

    # window centres; window k covers the half-open interval [centre-step/2, centre+step/2)
    centres = np.arange(first+step/2, last+step/2, step)
    reduced = np.empty((len(centres)))
    for k, centre in enumerate(centres):
        in_window = (x_obs >= centre-step/2) & (x_obs < centre+step/2)
        reduced[k] = f(self.Y[in_window])

    self.X = np.expand_dims(centres, 1)
    self.Y = reduced
    self.mask = np.array([True] * len(self.X))
def copy(self)
-
Make a deep copy of Data.
Returns
Data
Examples
>>> other = data.copy()
Expand source code Browse git
def copy(self):
    """
    Create an independent duplicate of this Data object.

    The duplicate is produced with copy.deepcopy, so changes to the
    returned object do not affect the original.

    Returns:
        mogptk.data.Data

    Examples:
        >>> other = data.copy()
    """
    duplicate = copy.deepcopy(self)
    return duplicate
def filter(self, start, end)
-
Filter the data range to be between start and end. Start and end can be strings if a proper formatter is set for the independent variable.
Args
start
:float
,str
- Start of interval.
end
:float
,str
- End of interval.
Examples
>>> data = mogptk.LoadFunction(lambda x: np.sin(3*x[:,0]), 0, 10, n=200, var=0.1, name='Sine wave') >>> data.filter(3, 8) >>> data = mogptk.LoadCSV('gold.csv', 'Date', 'Price', format={'Date': mogptk.FormatDate}) >>> data.filter('2016-01-15', '2016-06-15')
Expand source code Browse git
def filter(self, start, end):
    """
    Keep only the observations whose X value lies in the half-open interval [start, end).

    Start and end are parsed by the formatter of the first input dimension, so they
    can be strings if a proper formatter is set for the independent variable.
    X, Y and the mask are reduced in place.

    Args:
        start (float, str): Start of interval (inclusive).
        end (float, str): End of interval (exclusive).

    Raises:
        ValueError: If the data does not have exactly one input dimension.

    Examples:
        >>> data = mogptk.LoadFunction(lambda x: np.sin(3*x[:,0]), 0, 10, n=200, var=0.1, name='Sine wave')
        >>> data.filter(3, 8)

        >>> data = mogptk.LoadCSV('gold.csv', 'Date', 'Price', format={'Date': mogptk.FormatDate})
        >>> data.filter('2016-01-15', '2016-06-15')
    """
    if self.get_input_dims() != 1:
        raise ValueError("can only filter on one dimensional input data")

    # convert the bounds to the internal (offset and scaled) input units
    lo = self.X_scales[0] * (self.formatters[0].parse(start) - self.X_offsets[0])
    hi = self.X_scales[0] * (self.formatters[0].parse(end) - self.X_offsets[0])

    keep = (self.X[:,0] >= lo) & (self.X[:,0] < hi)
    kept_x = self.X[keep,0]

    self.X = kept_x[:, np.newaxis]
    self.Y = self.Y[keep]
    self.mask = self.mask[keep]
def get_bnse_estimation(self, Q=1, n=5000)
-
Peaks estimation using BNSE (Bayesian Non-parametric Spectral Estimation).
Args
Q
:int
- Number of peaks to find, defaults to 1.
n
:int
- Number of points of the grid to evaluate frequencies, defaults to 5000.
Returns
numpy.ndarray: Amplitude array of shape (input_dims,Q). numpy.ndarray: Frequency array of shape (input_dims,Q) in radians. numpy.ndarray: Variance array of shape (input_dims,Q) in radians.
Examples
>>> amplitudes, means, variances = data.get_bnse_estimation()
Expand source code Browse git
def get_bnse_estimation(self, Q=1, n=5000):
    """
    Peaks estimation using BNSE (Bayesian Non-parametric Spectral Estimation).

    For every input dimension a BNSE model is fitted to the training observations and
    the first Q peaks returned by its get_freq_peaks() are stored. If fewer than Q peaks
    are found, the found peaks are repeated cyclically; if none are found the row for
    that dimension stays zero.

    Args:
        Q (int): Number of peaks to find, defaults to 1.
        n (int): Number of points of the grid to evaluate frequencies, defaults to 5000.

    Returns:
        numpy.ndarray: Amplitude array of shape (input_dims,Q).
        numpy.ndarray: Frequency array of shape (input_dims,Q).
        numpy.ndarray: Variance array of shape (input_dims,Q).

    Examples:
        >>> amplitudes, means, variances = data.get_bnse_estimation()
    """
    # NOTE(review): the previous documentation stated that frequencies and variances are
    # "in radians"; the units are whatever bse.get_freq_peaks() returns, confirm there.
    input_dims = self.get_input_dims()

    # Gaussian: f(x) = A * exp(-(x-B)^2 / (2C^2))
    # Ie. A is the amplitude or peak height, B the mean or peak position, and C the variance or peak width
    A = np.zeros((input_dims, Q))
    B = np.zeros((input_dims, Q))
    C = np.zeros((input_dims, Q))

    nyquist = self.get_nyquist_estimation()
    for i in range(input_dims):
        x = self.X[self.mask, i]
        y = self.Y[self.mask]
        bnse = bse(x, y)
        # `n` must remain the grid size for every dimension, so the number of
        # peaks found is never stored back into it
        bnse.set_freqspace(nyquist[i], dimension=n)
        bnse.train()
        bnse.compute_moments()

        amplitudes, positions, variances = bnse.get_freq_peaks()
        # TODO: sqrt of amplitudes? vs LS?
        if len(positions) == 0:
            continue

        # np.resize truncates to Q entries or, when there are not enough peaks,
        # repeats the found peaks cyclically until Q entries are filled
        A[i,:] = np.resize(amplitudes, Q)
        B[i,:] = np.resize(positions, Q)
        C[i,:] = np.resize(variances, Q)
    return A, B, C
def get_data(self)
-
Returns all observations, train and test.
Returns
numpy.ndarray: X data of shape (n,input_dims). numpy.ndarray: Y data of shape (n).
Examples
>>> x, y = data.get_data()
Expand source code Browse git
def get_data(self):
    """
    Return every observation, both training and test, in original units.

    The stored inputs are mapped back with X_offsets + X/X_scales and the stored
    outputs are passed through the inverse of the registered transformations.

    Returns:
        numpy.ndarray: X data of shape (n,input_dims).
        numpy.ndarray: Y data of shape (n).

    Examples:
        >>> x, y = data.get_data()
    """
    inputs, outputs = self.X, self.Y
    x_original = self.X_offsets + inputs/self.X_scales
    y_original = self._detransform(outputs, inputs)
    return x_original, y_original
def get_input_dims(self)
-
Returns the number of input dimensions.
Returns
int
- Input dimensions.
Examples
>>> data.get_input_dims() 2
Expand source code Browse git
def get_input_dims(self):
    """
    Return how many input dimensions the data has, ie. the number of columns of X.

    Returns:
        int: Input dimensions.

    Examples:
        >>> data.get_input_dims()
        2
    """
    x_shape = self.X.shape
    return x_shape[1]
def get_lombscargle_estimation(self, Q=1, n=50000)
-
Peak estimation using Lomb Scargle.
Args
Q
:int
- Number of peaks to find, defaults to 1.
n
:int
- Number of points to use for Lomb Scargle, defaults to 50000.
Returns
numpy.ndarray: Amplitude array of shape (input_dims,Q). numpy.ndarray: Frequency array of shape (input_dims,Q) in radians. numpy.ndarray: Variance array of shape (input_dims,Q) in radians.
Examples
>>> amplitudes, means, variances = data.get_lombscargle_estimation()
Expand source code Browse git
def get_lombscargle_estimation(self, Q=1, n=50000):
    """
    Peak estimation using Lomb Scargle.

    For every input dimension the Lomb-Scargle periodogram of the training observations
    is evaluated on a grid of n angular frequencies up to 2*pi times the Nyquist estimate.
    The Q highest peaks are returned. If fewer than Q peaks are found, the found peaks are
    repeated cyclically; if none are found the row for that dimension stays zero.

    Args:
        Q (int): Number of peaks to find, defaults to 1.
        n (int): Number of points to use for Lomb Scargle, defaults to 50000. Must be at least 2.

    Returns:
        numpy.ndarray: Amplitude array of shape (input_dims,Q).
        numpy.ndarray: Frequency array of shape (input_dims,Q), the angular peak position divided by 2*pi.
        numpy.ndarray: Variance array of shape (input_dims,Q), the peak width (Gaussian sigma) divided by 2*pi.

    Examples:
        >>> amplitudes, means, variances = data.get_lombscargle_estimation()
    """
    input_dims = self.get_input_dims()

    # Gaussian: f(x) = A * exp(-(x-B)^2 / (2C^2))
    # Ie. A is the amplitude or peak height, B the mean or peak position, and C the variance or peak width
    A = np.zeros((input_dims, Q))
    B = np.zeros((input_dims, Q))
    C = np.zeros((input_dims, Q))

    nyquist = self.get_nyquist_estimation() * 2 * np.pi  # angular frequency
    for i in range(input_dims):
        # `n` must remain the grid size for every dimension, so the number of
        # peaks found is never stored back into it
        freqs = np.linspace(0, nyquist[i], n+1)[1:]  # drop the zero frequency
        dfreq = freqs[1]-freqs[0]

        power = signal.lombscargle(self.X[self.mask,i], self.Y[self.mask], freqs)
        ind, _ = signal.find_peaks(power)
        if len(ind) == 0:
            # no peaks in this dimension: leave zeros, like get_bnse_estimation does
            continue
        ind = ind[np.argsort(power[ind])[::-1]] # sort by biggest peak first

        # widths are returned in samples; convert to frequency and divide by 2*pi
        widths, width_heights, _, _ = signal.peak_widths(power, ind, rel_height=0.5)
        widths *= dfreq / np.pi / 2.0

        positions = freqs[ind] / np.pi / 2.0
        amplitudes = power[ind]
        variances = widths / np.sqrt(8 * np.log(amplitudes / width_heights)) # from full-width half-maximum to Gaussian sigma

        # np.resize truncates to Q entries or, when there are not enough peaks,
        # repeats the found peaks cyclically until Q entries are filled
        A[i,:] = np.resize(amplitudes, Q)
        B[i,:] = np.resize(positions, Q)
        C[i,:] = np.resize(variances, Q)
    return A, B, C
def get_name(self)
-
Return the name.
Returns
str.
Examples
>>> data.get_name() 'A'
Expand source code Browse git
def get_name(self):
    """
    Return the name that was given to this data.

    Returns:
        str.

    Examples:
        >>> data.get_name()
        'A'
    """
    name = self.name
    return name
def get_nyquist_estimation(self)
-
Estimate nyquist frequency by taking 0.5/(minimum distance of points).
Returns
numpy.ndarray: Nyquist frequency array of shape (input_dims,).
Examples
>>> freqs = data.get_nyquist_estimation()
Expand source code Browse git
def get_nyquist_estimation(self):
    """
    Estimate the Nyquist frequency per input dimension as 0.5/(minimum distance of points).

    Only training observations are used. The inputs are sorted per dimension and
    distances of zero (duplicate input values) are ignored.

    Returns:
        numpy.ndarray: Nyquist frequency array of shape (input_dims,).

    Examples:
        >>> freqs = data.get_nyquist_estimation()
    """
    dims = self.get_input_dims()

    estimates = np.empty((dims))
    for d in range(self.get_input_dims()):
        ordered = np.sort(self.X[self.mask,d])
        gaps = np.abs(ordered[1:]-ordered[:-1])
        # TODO: use average distance instead of minimal distance?
        smallest_gap = np.min(gaps[np.nonzero(gaps)])
        estimates[d] = 0.5/smallest_gap
    return estimates
def get_prediction(self, name, sigma=2)
-
Returns the prediction of a given name with a normal variance of sigma.
Args
name
:str
- Name of the prediction, equals the name of the model that made the prediction.
sigma
:float
- The uncertainty interval is calculated as mean - sigma*sqrt(var) and mean + sigma*sqrt(var). Defaults to 2.
Returns
numpy.ndarray: X prediction of shape (n,input_dims). numpy.ndarray: Y mean prediction of shape (n,). numpy.ndarray: Y lower prediction of uncertainty interval of shape (n,). numpy.ndarray: Y upper prediction of uncertainty interval of shape (n,).
Examples
>>> x, y_mean, y_var_lower, y_var_upper = data.get_prediction('MOSM', sigma=1)
Expand source code Browse git
def get_prediction(self, name, sigma=2):
    """
    Returns the prediction of a given name with an uncertainty interval of sigma standard deviations.

    The mean and the interval bounds are computed in the transformed output space and are
    then mapped back with the inverse of the registered transformations. The prediction
    inputs are mapped back to original units with X_offsets + X_pred/X_scales, the same
    inverse mapping that get_data() applies to the observations.

    Args:
        name (str): Name of the prediction, equals the name of the model that made the prediction.
        sigma (float): The uncertainty interval is calculated at mean-sigma*sqrt(var) and mean+sigma*sqrt(var). Defaults to 2.

    Returns:
        numpy.ndarray: X prediction of shape (n,input_dims).
        numpy.ndarray: Y mean prediction of shape (n,).
        numpy.ndarray: Y lower prediction of uncertainty interval of shape (n,).
        numpy.ndarray: Y upper prediction of uncertainty interval of shape (n,).

    Raises:
        Exception: If no prediction with the given name exists.

    Examples:
        >>> x, y_mean, y_var_lower, y_var_upper = data.get_prediction('MOSM', sigma=1)
    """
    if name not in self.Y_mu_pred:
        raise Exception("prediction name '%s' does not exist" % (name))

    mu = self.Y_mu_pred[name]
    std = np.sqrt(self.Y_var_pred[name])
    lower = mu - sigma * std
    upper = mu + sigma * std

    mu = self._detransform(mu, self.X_pred)
    lower = self._detransform(lower, self.X_pred)
    upper = self._detransform(upper, self.X_pred)

    # X_pred is stored as X_scales*(x-X_offsets), so undo that mapping here
    # instead of applying it a second time
    x = self.X_offsets + self.X_pred/self.X_scales
    return x, mu, lower, upper
def get_test_data(self)
-
Returns the observations used for testing.
Returns
numpy.ndarray: X data of shape (n,input_dims). numpy.ndarray: Y data of shape (n).
Examples
>>> x, y = data.get_test_data()
Expand source code Browse git
def get_test_data(self):
    """
    Return the observations that are held out for testing (mask is False), in original units.

    Returns:
        numpy.ndarray: X data of shape (n,input_dims).
        numpy.ndarray: Y data of shape (n).

    Examples:
        >>> x, y = data.get_test_data()
    """
    held_out = ~self.mask
    inputs = self.X[held_out,:]
    outputs = self.Y[held_out]

    x_original = self.X_offsets + inputs/self.X_scales
    y_original = self._detransform(outputs, inputs)
    return x_original, y_original
def get_train_data(self)
-
Returns the observations used for training.
Returns
numpy.ndarray: X data of shape (n,input_dims). numpy.ndarray: Y data of shape (n).
Examples
>>> x, y = data.get_train_data()
Expand source code Browse git
def get_train_data(self):
    """
    Return the observations that are used for training (mask is True), in original units.

    Returns:
        numpy.ndarray: X data of shape (n,input_dims).
        numpy.ndarray: Y data of shape (n).

    Examples:
        >>> x, y = data.get_train_data()
    """
    selected = self.mask
    inputs = self.X[selected,:]
    outputs = self.Y[selected]

    x_original = self.X_offsets + inputs/self.X_scales
    y_original = self._detransform(outputs, inputs)
    return x_original, y_original
def has_test_data(self)
-
Returns True if observations have been removed using the remove_* methods.
Returns
boolean
Examples
>>> data.has_test_data() True
Expand source code Browse git
def has_test_data(self):
    """
    Tell whether any observation is marked as removed, ie. the mask contains a False entry.

    Observations are removed using the remove_* methods.

    Returns:
        boolean

    Examples:
        >>> data.has_test_data()
        True
    """
    removed = np.logical_not(self.mask)
    return bool(removed.any())
def plot(self, ax=None, plot_legend=False)
-
Plot the data including removed observations, latent function, and predictions.
Args
ax
:matplotlib.axes.Axes
, optional- Draw to this axes, otherwise draw to the current axes.
Returns
matplotlib.axes.Axes
Expand source code Browse git
def plot(self, ax=None, plot_legend=False):
    """
    Plot the data including removed observations, latent function, and predictions.

    Everything is drawn in original input units (X_offsets + X/X_scales), including
    the shaded regions that mark removed observations.

    Args:
        ax (matplotlib.axes.Axes, optional): Draw to this axes, otherwise draw to the current axes.
        plot_legend (bool, optional): Draw a legend above the axes, defaults to False.

    Returns:
        matplotlib.axes.Axes

    Raises:
        Exception: If the data has two or more input dimensions.
    """
    # TODO: ability to plot conditional or marginal distribution to reduce input dims
    if self.get_input_dims() > 2:
        raise Exception("cannot plot more than two input dimensions")
    if self.get_input_dims() == 2:
        raise Exception("two dimensional input data not yet implemented") # TODO

    if ax is None:
        ax = plt.gca()

    # map inputs back to original units for drawing
    X = self.X_offsets + (self.X/self.X_scales)
    X_pred = self.X_offsets + (self.X_pred/self.X_scales)

    legend = []
    colors = list(matplotlib.colors.TABLEAU_COLORS)
    for i, name in enumerate(self.Y_mu_pred):
        if self.Y_mu_pred[name].size != 0:
            # cycle through the palette so many predictions cannot index past its end
            color = colors[i % len(colors)]

            # NOTE(review): the band is mean -/+ variance in the transformed output space,
            # whereas get_prediction() uses sigma*sqrt(variance) and detransforms; confirm intent
            lower = self.Y_mu_pred[name] - self.Y_var_pred[name]
            upper = self.Y_mu_pred[name] + self.Y_var_pred[name]
            ax.plot(X_pred[:,0], self.Y_mu_pred[name], ls='-', color=color, lw=2)
            ax.fill_between(X_pred[:,0], lower, upper, color=color, alpha=0.1)
            ax.plot(X_pred[:,0], lower, ls='-', color=color, lw=1, alpha=0.5)
            ax.plot(X_pred[:,0], upper, ls='-', color=color, lw=1, alpha=0.5)

            label = 'Prediction'
            if 1 < len(self.Y_mu_pred):
                label += ' ' + name
            legend.append(plt.Line2D([0], [0], ls='-', color=color, lw=2, label=label))

    if self.F is not None:
        # draw the true latent function on a grid ten times denser than the data
        n = len(X[:,0])*10
        x_min = np.min(X[:,0])
        x_max = np.max(X[:,0])
        if len(X_pred) != 0:
            x_min = min(x_min, np.min(X_pred))
            x_max = max(x_max, np.max(X_pred))

        x = np.empty((n, 1))
        x[:,0] = np.linspace(x_min, x_max, n)
        y = self.F(x)

        ax.plot(x[:,0], y, 'r--', lw=1)
        legend.append(plt.Line2D([0], [0], ls='--', color='r', label='True'))

    ax.plot(X[:,0], self.Y, 'k--', alpha=0.8)
    legend.append(plt.Line2D([0], [0], ls='--', color='k', label='All Points'))

    if self.has_test_data():
        x, y = X[self.mask,:], self.Y[self.mask]
        ax.plot(x[:,0], y, 'k.', mew=1, ms=13, markeredgecolor='white')
        legend.append(plt.Line2D([0], [0], ls='', marker='.', color='k', mew=2, ms=10, label='Training Points'))

        last = len(X) - 1
        for idx in np.where(~self.mask)[0]:
            # shade from halfway to the previous point up to halfway to the next point,
            # using the same original units as the curves drawn above
            left = X[max(idx - 1, 0), 0]
            centre = X[idx, 0]
            right = X[min(idx + 1, last), 0]
            width_1 = (centre - left) / 2
            width_2 = (right - centre) / 2

            y_low, y_high = ax.get_ylim()
            ax.add_patch(
                patches.Rectangle(
                    (left + width_1, y_low), # (x,y)
                    width_1 + width_2, # width
                    y_high - y_low, # height
                    fill=True,
                    color='xkcd:strawberry',
                    alpha=0.25,
                    lw=0,
                ))
        legend.append(patches.Rectangle(
            (1, 1), 1, 1,
            fill=True,
            color='xkcd:strawberry',
            alpha=0.5,
            lw=0,
            label='Removed Points'
        ))

    xmin = X.min()
    xmax = X.max()
    ax.set_xlim(xmin - (xmax - xmin)*0.001, xmax + (xmax - xmin)*0.001)

    ax.set_xlabel(self.X_labels[0])
    ax.set_ylabel(self.Y_label)
    ax.set_title(self.name)

    formatter = matplotlib.ticker.FuncFormatter(lambda x,pos: self.formatters[0].format(x))
    ax.xaxis.set_major_formatter(formatter)

    if (len(legend) > 0) and plot_legend:
        ax.legend(handles=legend, loc='upper center', ncol=len(legend), bbox_to_anchor=(0.5, 1.7))

    return ax
def plot_spectrum(self, method='lombscargle', ax=None, per=None, maxfreq=None)
-
Plot the spectrum of the data.
Args
method
:str
, optional- Set the method to get the spectrum such as 'lombscargle'.
ax
:matplotlib.axes.Axes
, optional- Draw to this axes, otherwise draw to the current axes.
per
:float
,str
- Set the scale of the X axis depending on the formatter used, eg. per=5 or per='3d' for three days.
maxfreq
:float
, optional- Maximum frequency to plot, otherwise the reciprocal of the average distance between consecutive observations is used.
Returns
matplotlib.axes.Axes
Expand source code Browse git
def plot_spectrum(self, method='lombscargle', ax=None, per=None, maxfreq=None):
    """
    Plot the spectrum of the data.

    The spectrum is evaluated on 10000 equally spaced frequencies in (0, maxfreq]
    using all observations (training and test).

    Args:
        method (str, optional): Set the method to get the spectrum, either 'lombscargle' or 'bnse'.
        ax (matplotlib.axes.Axes, optional): Draw to this axes, otherwise draw to the current axes.
        per (float, str): Set the scale of the X axis depending on the formatter used; it is passed to the formatter's get_scale().
        maxfreq (float, optional): Maximum frequency to plot, otherwise the reciprocal of the average distance between consecutive observations is used.

    Returns:
        matplotlib.axes.Axes

    Raises:
        Exception: If the data has two or more input dimensions.
        ValueError: If the method is not known.
    """
    # TODO: ability to plot conditional or marginal distribution to reduce input dims
    if self.get_input_dims() > 2:
        raise Exception("cannot plot more than two input dimensions")
    if self.get_input_dims() == 2:
        raise Exception("two dimensional input data not yet implemented") # TODO

    if ax is None:
        ax = plt.gca()

    formatter = self.formatters[0]
    factor, name = formatter.get_scale(per)
    if name is not None:
        ax.set_xlabel('Frequency (1/'+name+')')
    else:
        ax.set_xlabel('Frequency [Hz]')

    # inputs in original units, expressed in the unit chosen by the formatter's scale
    X_space = np.squeeze((self.X_offsets + (self.X/self.X_scales)) / factor)

    freq = maxfreq
    if freq is None:
        dist = np.abs(X_space[1:]-X_space[:-1])
        freq = 1/np.average(dist)

    X = np.linspace(0.0, freq, 10001)[1:]  # drop the zero frequency
    Y_err = []
    if method == 'lombscargle':
        # NOTE(review): scipy's lombscargle interprets its frequencies as angular
        # frequencies; confirm the x-axis units against the axis label
        Y = signal.lombscargle(X_space, self.Y, X)
    elif method == 'bnse':
        # TODO: check if outcome is correct
        bnse = bse(X_space, self.Y)
        bnse.set_freqspace(freq/2.0/np.pi, 10001)
        bnse.train()
        bnse.compute_moments()

        Y = bnse.post_mean_r**2 + bnse.post_mean_i**2
        Y_err = 2 * np.sqrt(np.diag(bnse.post_cov_r**2 + bnse.post_cov_i**2))
        Y = Y[1:]
        Y_err = Y_err[1:]
    else:
        raise ValueError('periodogram method "%s" does not exist' % (method))

    ax.plot(X, Y, '-', color='xkcd:strawberry', lw=2.3)
    if len(Y_err) != 0:
        ax.fill_between(X, Y-Y_err, Y+Y_err, alpha=0.4)

    ax.set_title(self.name + ' Spectrum')

    xmin = X.min()
    xmax = X.max()
    ax.set_xlim(xmin - (xmax - xmin)*0.005, xmax + (xmax - xmin)*0.005)
    ax.set_yticks([])
    ax.set_ylim(0, None)

    return ax
def remove_random_ranges(self, n, duration)
-
Removes a number of ranges to simulate sensor failure.
Args
n
:int
- Number of ranges to remove.
duration
:float
,str
- Width of ranges to remove, can use a number or the duration format syntax (see aggregate()).
Examples
>>> data.remove_random_ranges(2, 5) # remove two ranges that are 5 wide in input space >>> data.remove_random_ranges(3, '1d') # remove three ranges that are 1 day wide
Expand source code Browse git
def remove_random_ranges(self, n, duration):
    """
    Removes a number of ranges to simulate sensor failure.

    For every range a start location is drawn at random from the observations, and
    all observations in [start, start+duration) are marked as removed. Start locations
    closer than `duration` to an earlier start are excluded from later draws.

    Args:
        n (int): Number of ranges to remove.
        duration (float, str): Width of ranges to remove, can use a number or the duration format syntax (see aggregate()).

    Raises:
        Exception: If the data is not one dimensional, or if n ranges of this width cover the whole input range.
        ValueError: If no start location is left for a range.

    Examples:
        >>> data.remove_random_ranges(2, 5) # remove two ranges that are 5 wide in input space

        >>> data.remove_random_ranges(3, '1d') # remove three ranges that are 1 day wide
    """
    if self.get_input_dims() != 1:
        raise Exception("can only remove ranges on one dimensional input data")

    # self.X holds scaled inputs, so the duration is scaled the same way as in aggregate()
    duration = self.X_scales[0] * self.formatters[0].parse_delta(duration)
    if n < 1:
        return

    x = self.X[:,0]
    if (x[-1]-x[0]) - n*duration <= 0:
        raise Exception("no data left after removing ranges")

    # candidate start locations: every point that has a full duration of data after it
    locs = x <= x[-1]-duration
    first_excluded = locs.sum()
    if first_excluded < len(locs):
        locs[first_excluded] = True # make sure the last data point can be deleted

    for _ in range(n):
        candidates = x[locs]
        if len(candidates) == 0:
            raise ValueError("no start locations left to remove another range")
        start = candidates[np.random.randint(len(candidates))]

        # later ranges may not start within one duration of this start location
        locs[(x > start-duration) & (x < start+duration)] = False
        self.mask[(x >= start) & (x < start+duration)] = False
def remove_randomly(self, n=None, pct=None)
-
Removes observations randomly on the whole range. Either 'n' observations are removed, or a percentage of the observations.
Args
n
:int
, optional- Number of observations to remove randomly.
pct
:float
, optional- Percentage in interval [0,1] of observations to remove randomly.
Examples
>>> data.remove_randomly(50) # remove 50 observations >>> data.remove_randomly(pct=0.9) # remove 90% of the observations
Expand source code Browse git
def remove_randomly(self, n=None, pct=None):
    """
    Mark randomly chosen observations as removed, drawn without replacement from the whole range.

    Either 'n' observations are chosen, or a percentage of the observations. When
    neither is given nothing is removed. When 'n' is given, 'pct' is ignored.

    Args:
        n (int, optional): Number of observations to remove randomly.
        pct (float, optional): Percentage in interval [0,1] of observations to remove randomly.

    Examples:
        >>> data.remove_randomly(50) # remove 50 observations

        >>> data.remove_randomly(pct=0.9) # remove 90% of the observations
    """
    if n is None:
        n = 0 if pct is None else int(pct * self.X.shape[0])

    chosen = np.random.choice(self.X.shape[0], n, replace=False)
    self.mask[chosen] = False
def remove_range(self, start=None, end=None)
-
Removes observations in the interval [start,end]. Start and end can be strings if a proper formatter is set for the independent variable.
Args
start
:float
,str
, optional- Start of interval. Defaults to first value in observations.
end
:float
,str
, optional- End of interval. Defaults to last value in observations.
Examples
>>> data = mogptk.LoadFunction(lambda x: np.sin(3*x[:,0]), 0, 10, n=200, var=0.1, name='Sine wave') >>> data.remove_range(3, 8) >>> data = mogptk.LoadCSV('gold.csv', 'Date', 'Price', format={'Date': mogptk.FormatDate}) >>> data.remove_range('2016-01-15', '2016-06-15')
Expand source code Browse git
def remove_range(self, start=None, end=None):
    """
    Removes observations in the interval [start,end]. Start and end can be strings if a proper formatter is set for the independent variable.

    Args:
        start (float, str, optional): Start of interval. Defaults to first value in observations.
        end (float, str, optional): End of interval. Defaults to last value in observations.

    Raises:
        Exception: If the data has more than one input dimension.

    Examples:
        >>> data = mogptk.LoadFunction(lambda x: np.sin(3*x[:,0]), 0, 10, n=200, var=0.1, name='Sine wave')
        >>> data.remove_range(3, 8)

        >>> data = mogptk.LoadCSV('gold.csv', 'Date', 'Price', format={'Date': mogptk.FormatDate})
        >>> data.remove_range('2016-01-15', '2016-06-15')
    """
    if self.get_input_dims() != 1:
        raise Exception("can only remove ranges on one dimensional input data")

    x = self.X[:,0]
    # given bounds are parsed and then mapped to the scaled X axis; defaults are taken from X directly
    if start is None:
        start = np.min(x)
    else:
        start = self.X_scales[0] * (self.formatters[0].parse(start) - self.X_offsets[0])
    if end is None:
        end = np.max(x)
    else:
        end = self.X_scales[0] * (self.formatters[0].parse(end) - self.X_offsets[0])

    self.mask[(x >= start) & (x <= end)] = False
def remove_relative_range(self, start=None, end=None)
-
Removes observations between start and end as a percentage of the number of observations. So '0' is the first observation, '0.5' is the middle observation, and '1' is the last observation.
Args
start
:float
- Start percentage in interval [0,1].
end
:float
- End percentage in interval [0,1].
Expand source code Browse git
def remove_relative_range(self, start=None, end=None):
    """
    Removes observations between start and end, both given as a fraction of the input range. So '0' is the smallest X value, '0.5' is halfway between the smallest and largest X value, and '1' is the largest X value.

    Values outside [0,1] are clamped to that interval.

    Args:
        start (float): Start percentage in interval [0,1]. Defaults to 0.
        end (float): End percentage in interval [0,1]. Defaults to 1.

    Raises:
        Exception: If the data has more than one input dimension.
    """
    if self.get_input_dims() != 1:
        raise Exception("can only remove ranges on one dimensional input data")

    end = 1 if end is None else end
    start = 0 if start is None else start

    x = self.X[:,0]
    x_min = np.min(x)
    x_max = np.max(x)
    lower = x_min + max(0.0, min(1.0, start)) * (x_max-x_min)
    upper = x_min + max(0.0, min(1.0, end)) * (x_max-x_min)

    selected = np.logical_and(x >= lower, x <= upper)
    self.mask[np.where(selected)] = False
def set_function(self, f)
-
Set a (latent) function for the data, ie. the theoretical or true signal. This is used for plotting purposes and is optional.
The function should take one argument x with shape (n,input_dims) and return y with shape (n). If your data has only one input dimension, you can use x[:,0] to select only the first (and only) input dimension.
Args
f
:function
- Function taking x with shape (n,input_dims) and returning shape (n) as y.
Examples
>>> data.set_function(lambda x: np.sin(3*x[:,0]))
Expand source code Browse git
def set_function(self, f):
    """
    Set a (latent) function for the data, ie. the theoretical or true signal. This is used for plotting purposes and is optional.

    The function should take one argument x with shape (n,input_dims) and return y with shape (n). If your data has only one input dimension, you can use x[:,0] to select only the first (and only) input dimension.

    The function is validated with _check_function before it is stored.

    Args:
        f (function): Function taking x with shape (n,input_dims) and returning shape (n) as y.

    Examples:
        >>> data.set_function(lambda x: np.sin(3*x[:,0]))
    """
    input_dims = self.get_input_dims()
    _check_function(f, input_dims)
    self.F = f
def set_labels(self, x_labels, y_label)
-
Set axes labels for plots.
Args
x_labels
:str
,list
of str
- X data names for each input dimension.
y_label
:str
- Y data name for output dimension.
Examples
>>> data.set_labels(['X', 'Y'], 'Cd')
Expand source code Browse git
def set_labels(self, x_labels, y_label):
    """
    Set axes labels for plots.

    A single string for x_labels is treated as a list with one label.

    Args:
        x_labels (str, list of str): X data names for each input dimension.
        y_label (str): Y data name for output dimension.

    Raises:
        ValueError: If x_labels is not a string or list of strings, if y_label is not a string, or if the number of X labels differs from the number of input dimensions.

    Examples:
        >>> data.set_labels(['X', 'Y'], 'Cd')
    """
    if isinstance(x_labels, str):
        x_labels = [x_labels]
    elif not (isinstance(x_labels, list) and all(isinstance(label, str) for label in x_labels)):
        raise ValueError("x_labels must be list of strings")

    if not isinstance(y_label, str):
        raise ValueError("y_label must be string")

    if len(x_labels) != self.get_input_dims():
        raise ValueError("x_labels must have the same input dimensions as the data")

    self.X_labels, self.Y_label = x_labels, y_label
def set_name(self, name)
-
Set name for data.
Args
name
:str
- Name of data.
Examples
>>> data.set_name('Channel A')
Expand source code Browse git
def set_name(self, name):
    """
    Set the name of this data set.

    The value is stored as given, without validation.

    Args:
        name (str): Name of data.

    Examples:
        >>> data.set_name('Channel A')
    """
    self.name = name
def set_prediction_range(self, start=None, end=None, n=None, step=None)
-
Sets the prediction range.
The interval is set with [start,end], with either 'n' points or a given 'step' between the points. Start and end can be set as strings and step in the duration string format if the proper formatter is set.
Args
start
:float
,str
, optional- Start of interval, defaults to the first observation.
end
:float
,str
, optional- End of interval, defaults to the last observation.
n
:int
, optional- Number of points to generate in the interval.
step
:float
,str
, optional- Spacing between points in the interval.
If neither 'step' or 'n' is passed, default number of points is 100.
Examples
>>> data = mogptk.LoadFunction(lambda x: np.sin(3*x[:,0]), 0, 10, n=200, var=0.1, name='Sine wave') >>> data.set_prediction_range(3, 8, 200) >>> data = mogptk.LoadCSV('gold.csv', 'Date', 'Price', formats={'Date': mogptk.FormatDate}) >>> data.set_prediction_range('2016-01-15', '2016-06-15', step='1d')
Expand source code Browse git
def set_prediction_range(self, start=None, end=None, n=None, step=None):
    """
    Sets the prediction range.

    The interval is set with [start,end], with either 'n' points or a given 'step' between the points. Start and end can be set as strings and step in the duration string format if the proper formatter is set.

    Args:
        start (float, str, optional): Start of interval, defaults to the first observation.
        end (float, str, optional): End of interval, defaults to the last observation.
        n (int, optional): Number of points to generate in the interval.
        step (float, str, optional): Spacing between points in the interval.

        If neither 'step' or 'n' is passed, the step defaults to 1/100th of the interval.

    Raises:
        Exception: If the data has more than one input dimension.
        ValueError: If end is not greater than start, or if step is not positive.

    Examples:
        >>> data = mogptk.LoadFunction(lambda x: np.sin(3*x[:,0]), 0, 10, n=200, var=0.1, name='Sine wave')
        >>> data.set_prediction_range(3, 8, 200)

        >>> data = mogptk.LoadCSV('gold.csv', 'Date', 'Price', formats={'Date': mogptk.FormatDate})
        >>> data.set_prediction_range('2016-01-15', '2016-06-15', step='1d')
    """
    if self.get_input_dims() != 1:
        raise Exception("can only set prediction range on one dimensional input data")

    input_dims = self.get_input_dims()

    def parse_bound(bound, observation):
        if bound is None:
            # self.X is stored scaled; undo the scaling so that the final scaling below is applied only once
            return self.X_offsets + (observation/self.X_scales)
        if isinstance(bound, list):
            # build a new list so that the caller's list is left untouched
            return [self.formatters[i].parse(bound[i]) for i in range(input_dims)]
        return self.formatters[0].parse(bound)

    start = _normalize_input_dims(parse_bound(start, self.X[0,:]), input_dims)
    end = _normalize_input_dims(parse_bound(end, self.X[-1,:]), input_dims)

    # TODO: works for multi input dims?
    if end <= start:
        raise ValueError("start must be lower than end")

    # TODO: prediction range for multi input dimension; fix other axes to zero so we can plot?
    if step is None and n is not None:
        X_pred = np.empty((n, input_dims))
        for i in range(input_dims):
            X_pred[:,i] = np.linspace(start[i], end[i], n)
    else:
        if input_dims != 1:
            raise ValueError("cannot use step for multi dimensional input, use n")
        if step is None:
            step = (end[0]-start[0])/100
        else:
            step = self.formatters[0].parse_delta(step)
        if step <= 0:
            raise ValueError("step must be positive")
        X_pred = np.arange(start[0], end[0]+step, step).reshape(-1, 1)

    self.X_pred = self.X_scales*(X_pred-self.X_offsets)

    # clear old prediction data now that X_pred has been updated
    self.Y_mu_pred = {}
    self.Y_var_pred = {}
def set_prediction_x(self, x)
-
Set the prediction range directly.
Args
x
:list
,numpy.ndarray
- Array of shape (n) or (n,input_dims) with input values to predict at.
Examples
>>> data.set_prediction_x([5.0, 5.5, 6.0, 6.5, 7.0])
Expand source code Browse git
def set_prediction_x(self, x):
    """
    Set the input values to predict at directly.

    The values are converted to floats, mapped to the scaled X axis using the
    current offsets and scales, and any previously stored predictions are discarded.

    Args:
        x (list, numpy.ndarray): Array of shape (n) or (n,input_dims) with input values to predict at.

    Raises:
        ValueError: If x is not a list or numpy.ndarray, or if its shape is not (n) or (n,input_dims).

    Examples:
        >>> data.set_prediction_x([5.0, 5.5, 6.0, 6.5, 7.0])
    """
    if not isinstance(x, (list, np.ndarray)):
        raise ValueError("x expected to be a list or numpy.ndarray")

    points = np.array(x) if isinstance(x, list) else x
    points = points.astype(float)
    if points.ndim == 1:
        # a flat array is a single column of input values
        points = points.reshape(-1, 1)
    if not (points.ndim == 2 and points.shape[1] == self.get_input_dims()):
        raise ValueError("x shape must be (n,input_dims)")

    self.X_pred = self.X_scales*(points-self.X_offsets)

    # clear old prediction data now that X_pred has been updated
    self.Y_mu_pred = {}
    self.Y_var_pred = {}
def set_x_scaling(self, offsets, scales)
-
Set offset and scaling of X axis for each input dimension.
Args
offsets
:float
,list
or np.ndarray
of floats
- X offsets per input dimension.
scales
:float
,list
or np.ndarray
of floats
- X scales per input dimension.
Examples
>>> data.set_x_scaling([2000.0], [0.1]) # subtract 2000 from X and multiply by 0.1
Expand source code Browse git
def set_x_scaling(self, offsets, scales):
    """
    Set offset and scaling of X axis for each input dimension.

    The current scaling is undone first, after which X and the prediction inputs
    are stored as scales*(x-offsets).

    Args:
        offsets (float, list or np.ndarray of floats): X offsets per input dimension.
        scales (float, list or np.ndarray of floats): X scales per input dimension, must be non-zero.

    Raises:
        ValueError: If offsets or scales are not numbers, do not match the number of input dimensions, or if a scale is zero.

    Examples:
        >>> data.set_x_scaling([2000.0], [0.1]) # subtract 2000 from X and multiply by 0.1
    """
    input_dims = self.get_input_dims()
    numeric = (int, float, np.integer, np.floating)

    def as_float_list(values, name):
        # accept a scalar or a sequence of real numbers, including ints and NumPy scalars
        if isinstance(values, numeric) and not isinstance(values, bool):
            values = [values]
        elif isinstance(values, np.ndarray):
            values = list(values)
        valid = (
            isinstance(values, list)
            and all(isinstance(item, numeric) and not isinstance(item, bool) for item in values)
            and len(values) == input_dims
        )
        if not valid:
            raise ValueError("%s must be a float, list or np.ndarray of floats and have the same input dimensions as the data" % (name,))
        return [float(item) for item in values]

    # validate both arguments before touching any state
    offsets = as_float_list(offsets, "offsets")
    scales = as_float_list(scales, "scales")
    if any(scale == 0.0 for scale in scales):
        # a zero scale cannot be undone by a later call as that divides by the scale
        raise ValueError("scales must be non-zero")

    # undo the current scaling to recover the original input values
    self.X = self.X_offsets + (self.X/self.X_scales)
    self.X_pred = self.X_offsets + (self.X_pred/self.X_scales)

    self.X_offsets = offsets
    self.X_scales = scales

    self.X = self.X_scales*(self.X-self.X_offsets)
    self.X_pred = self.X_scales*(self.X_pred-self.X_offsets)
def transform(self, transformer)
-
Transform the data by using one of the provided transformers, such as TransformDetrend, TransformNormalize, TransformLog, …
Args
transformer
:obj
- Transformer object with forward(y, x) and backward(y, x) methods.
Examples
>>> data.transform(mogptk.TransformDetrend)
Expand source code Browse git
def transform(self, transformer):
    """
    Transform the data by using one of the provided transformers, such as TransformDetrend, TransformNormalize, TransformLog, ...

    The transformer is first given the data through set_data(), then applied to Y and,
    if set, to the latent function. It is appended to the list of applied transformations.

    Args:
        transformer (obj): Transformer object with forward(y, x) and backward(y, x) methods. A class can be passed too, in which case it is instantiated without arguments.

    Examples:
        >>> data.transform(mogptk.TransformDetrend)
    """
    t = transformer() if isinstance(transformer, type) else transformer
    t.set_data(self)

    self.Y = t.forward(self.Y, self.X)
    if self.F is not None:
        # bind the current function to a local so the new lambda wraps it instead of itself
        f = self.F
        self.F = lambda x: t.forward(f(x), x)
    self.transformations.append(t)
class FormatBase
-
Expand source code Browse git
class FormatBase:
    """
    Base class for input formatters. Every method raises NotImplementedError and is meant to be overridden by subclasses such as FormatNumber or FormatDate.
    """

    def parse(self, val):
        """Convert an input value to its internal numeric representation."""
        raise NotImplementedError()

    def parse_delta(self, val):
        """Convert a difference between input values (e.g. a duration) to internal units."""
        raise NotImplementedError()

    def format(self, val):
        """Convert an internal numeric value to a string."""
        raise NotImplementedError()

    def get_scale(self, maxfreq=None):
        """Return a (scale, unit name) tuple for the given frequency unit."""
        raise NotImplementedError()
Subclasses
Methods
def format(self, val)
-
Expand source code Browse git
def format(self, val):
    """Convert an internal numeric value to a string; must be overridden by subclasses."""
    raise NotImplementedError()
def get_scale(self, maxfreq=None)
-
Expand source code Browse git
def get_scale(self, maxfreq=None):
    """Return a (scale, unit name) tuple for the given frequency unit; must be overridden by subclasses."""
    raise NotImplementedError()
def parse(self, val)
-
Expand source code Browse git
def parse(self, val):
    """Convert an input value to its internal numeric representation; must be overridden by subclasses."""
    raise NotImplementedError()
def parse_delta(self, val)
-
Expand source code Browse git
def parse_delta(self, val):
    """Convert a difference between input values to internal units; must be overridden by subclasses."""
    raise NotImplementedError()
class FormatDate
-
FormatDate is a formatter that takes date values as input, such as '2019-03-01', and stores values internally as days since 1970-01-01.
Expand source code Browse git
class FormatDate(FormatBase):
    """
    FormatDate is a formatter that takes date values as input, such as '2019-03-01', and stores values internally as days since 1970-01-01.
    """
    def __init__(self):
        self.category = 'date'

    def parse(self, val):
        """
        Convert a date to the number of days since 1970-01-01.

        Args:
            val (str, numpy.datetime64, datetime.datetime): Date to convert, strings are parsed by dateutil. Datetime objects are expected to be naive as they are subtracted from a naive epoch.

        Returns:
            float: Days since 1970-01-01, fractional when the value has a time of day.
        """
        if isinstance(val, np.datetime64):
            dt = pd.Timestamp(val).to_pydatetime()
        elif isinstance(val, datetime.datetime):
            # dateutil only accepts strings, use datetime objects as they are
            dt = val
        else:
            dt = dateutil.parser.parse(val)
        return (dt - datetime.datetime(1970,1,1)).total_seconds()/3600/24

    def parse_delta(self, val):
        """
        Convert a duration to days.

        Args:
            val (int, float, str): Number of days, or a duration string that is parsed by _parse_duration_to_sec.

        Returns:
            Duration in days, numbers are returned unchanged.

        Raises:
            ValueError: If val is neither a number nor a string.
        """
        if isinstance(val, (int, float, np.integer, np.floating)):
            return val
        if isinstance(val, str):
            return _parse_duration_to_sec(val)/24/3600
        raise ValueError("could not convert input to duration")

    def format(self, val):
        """Format a number of days since 1970-01-01 as a 'YYYY-MM-DD' string."""
        # add to the epoch instead of using utcfromtimestamp, which is deprecated and platform dependent for dates before 1970
        dt = datetime.datetime(1970,1,1) + datetime.timedelta(days=float(val))
        return dt.strftime('%Y-%m-%d')

    def get_scale(self, maxfreq=None):
        """
        Return the length of one time unit in days together with the unit name.

        Args:
            maxfreq (str, optional): One of 'year', 'month', 'day', 'hour', 'minute' or 'second'. Defaults to 'day'.

        Returns:
            tuple: (scale, unit name), or None if maxfreq is not one of the known units.
        """
        scales = {
            'year': (365.2425, 'year'),
            'month': (30.4369, 'month'),
            None: (1, 'day'),
            'day': (1, 'day'),
            'hour': (1/24, 'hour'),
            'minute': (1/24/60, 'minute'),
            'second': (1/24/3600, 'second'),
        }
        return scales.get(maxfreq)
Ancestors
Methods
def format(self, val)
-
Expand source code Browse git
def format(self, val):
    """Format a number of days since 1970-01-01 as a 'YYYY-MM-DD' string."""
    # add to the epoch instead of using utcfromtimestamp, which is deprecated and platform dependent for dates before 1970
    dt = datetime.datetime(1970,1,1) + datetime.timedelta(days=float(val))
    return dt.strftime('%Y-%m-%d')
def get_scale(self, maxfreq=None)
-
Expand source code Browse git
def get_scale(self, maxfreq=None):
    """
    Return the length of one time unit in days together with the unit name.

    Args:
        maxfreq (str, optional): One of 'year', 'month', 'day', 'hour', 'minute' or 'second'. Defaults to 'day'.

    Returns:
        tuple: (scale, unit name), or None if maxfreq is not one of the known units.
    """
    scales = {
        'year': (365.2425, 'year'),
        'month': (30.4369, 'month'),
        None: (1, 'day'),
        'day': (1, 'day'),
        'hour': (1/24, 'hour'),
        'minute': (1/24/60, 'minute'),
        'second': (1/24/3600, 'second'),
    }
    return scales.get(maxfreq)
def parse(self, val)
-
Expand source code Browse git
def parse(self, val):
    """
    Convert a date to the number of days since 1970-01-01.

    Args:
        val (str, numpy.datetime64, datetime.datetime): Date to convert, strings are parsed by dateutil. Datetime objects are expected to be naive as they are subtracted from a naive epoch.

    Returns:
        float: Days since 1970-01-01, fractional when the value has a time of day.
    """
    if isinstance(val, np.datetime64):
        dt = pd.Timestamp(val).to_pydatetime()
    elif isinstance(val, datetime.datetime):
        # dateutil only accepts strings, use datetime objects as they are
        dt = val
    else:
        dt = dateutil.parser.parse(val)
    return (dt - datetime.datetime(1970,1,1)).total_seconds()/3600/24
def parse_delta(self, val)
-
Expand source code Browse git
def parse_delta(self, val):
    """
    Convert a duration to days.

    Args:
        val (int, float, str): Number of days, or a duration string that is parsed by _parse_duration_to_sec.

    Returns:
        Duration in days, numbers are returned unchanged.

    Raises:
        ValueError: If val is neither a number nor a string.
    """
    if isinstance(val, (int, float, np.integer, np.floating)):
        return val
    if isinstance(val, str):
        return _parse_duration_to_sec(val)/24/3600
    raise ValueError("could not convert input to duration")
class FormatDateTime
-
FormatDateTime is a formatter that takes date and time values as input, such as '2019-03-01 12:30', and stores values internally as seconds since 1970-01-01.
Expand source code Browse git
class FormatDateTime(FormatBase):
    """
    FormatDateTime is a formatter that takes date and time values as input, such as '2019-03-01 12:30', and stores values internally as seconds since 1970-01-01.
    """
    def __init__(self):
        self.category = 'date'

    def parse(self, val):
        """
        Convert a date and time to the number of seconds since 1970-01-01.

        Args:
            val (str, numpy.datetime64, datetime.datetime): Value to convert, strings are parsed by dateutil. Datetime objects are expected to be naive as they are subtracted from a naive epoch.

        Returns:
            float: Seconds since 1970-01-01.
        """
        if isinstance(val, np.datetime64):
            dt = pd.Timestamp(val).to_pydatetime()
        elif isinstance(val, datetime.datetime):
            # dateutil only accepts strings, use datetime objects as they are
            dt = val
        else:
            dt = dateutil.parser.parse(val)
        return (dt - datetime.datetime(1970,1,1)).total_seconds()

    def parse_delta(self, val):
        """
        Convert a duration to seconds.

        Args:
            val (int, float, str): Number of seconds, or a duration string that is parsed by _parse_duration_to_sec.

        Returns:
            Duration in seconds, numbers are returned unchanged.

        Raises:
            ValueError: If val is neither a number nor a string.
        """
        if isinstance(val, (int, float, np.integer, np.floating)):
            return val
        if isinstance(val, str):
            return _parse_duration_to_sec(val)
        raise ValueError("could not convert input to duration")

    def format(self, val):
        """Format a number of seconds since 1970-01-01 as a 'YYYY-MM-DD HH:MM' string."""
        # add to the epoch instead of using utcfromtimestamp, which is deprecated and platform dependent for dates before 1970
        dt = datetime.datetime(1970,1,1) + datetime.timedelta(seconds=float(val))
        return dt.strftime('%Y-%m-%d %H:%M')

    def get_scale(self, maxfreq=None):
        """
        Return the length of one time unit in seconds together with the unit name.

        Args:
            maxfreq (str, optional): One of 'year', 'month', 'day', 'hour', 'minute' or 'second'. Defaults to 'second'.

        Returns:
            tuple: (scale, unit name), or None if maxfreq is not one of the known units.
        """
        scales = {
            'year': (3600*24*365.2425, 'year'),
            'month': (3600*24*30.4369, 'month'),
            'day': (3600*24, 'day'),
            'hour': (3600, 'hour'),
            'minute': (60, 'minute'),
            None: (1, 'second'),
            'second': (1, 'second'),
        }
        return scales.get(maxfreq)
Ancestors
Methods
def format(self, val)
-
Expand source code Browse git
def format(self, val):
    """Format a number of seconds since 1970-01-01 as a 'YYYY-MM-DD HH:MM' string."""
    # add to the epoch instead of using utcfromtimestamp, which is deprecated and platform dependent for dates before 1970
    dt = datetime.datetime(1970,1,1) + datetime.timedelta(seconds=float(val))
    return dt.strftime('%Y-%m-%d %H:%M')
def get_scale(self, maxfreq=None)
-
Expand source code Browse git
def get_scale(self, maxfreq=None):
    """
    Return the length of one time unit in seconds together with the unit name.

    Args:
        maxfreq (str, optional): One of 'year', 'month', 'day', 'hour', 'minute' or 'second'. Defaults to 'second'.

    Returns:
        tuple: (scale, unit name), or None if maxfreq is not one of the known units.
    """
    scales = {
        'year': (3600*24*365.2425, 'year'),
        'month': (3600*24*30.4369, 'month'),
        'day': (3600*24, 'day'),
        'hour': (3600, 'hour'),
        'minute': (60, 'minute'),
        None: (1, 'second'),
        'second': (1, 'second'),
    }
    return scales.get(maxfreq)
def parse(self, val)
-
Expand source code Browse git
def parse(self, val):
    """
    Convert a date and time to the number of seconds since 1970-01-01.

    Args:
        val (str, numpy.datetime64, datetime.datetime): Value to convert, strings are parsed by dateutil. Datetime objects are expected to be naive as they are subtracted from a naive epoch.

    Returns:
        float: Seconds since 1970-01-01.
    """
    if isinstance(val, np.datetime64):
        dt = pd.Timestamp(val).to_pydatetime()
    elif isinstance(val, datetime.datetime):
        # dateutil only accepts strings, use datetime objects as they are
        dt = val
    else:
        dt = dateutil.parser.parse(val)
    return (dt - datetime.datetime(1970,1,1)).total_seconds()
def parse_delta(self, val)
-
Expand source code Browse git
def parse_delta(self, val):
    """
    Convert a duration to seconds.

    Args:
        val (int, float, str): Number of seconds, or a duration string that is parsed by _parse_duration_to_sec.

    Returns:
        Duration in seconds, numbers are returned unchanged.

    Raises:
        ValueError: If val is neither a number nor a string.
    """
    if isinstance(val, (int, float, np.integer, np.floating)):
        return val
    if isinstance(val, str):
        return _parse_duration_to_sec(val)
    raise ValueError("could not convert input to duration")
class FormatNumber
-
FormatNumber is the default formatter and takes regular floating point values as input.
Expand source code Browse git
class FormatNumber(FormatBase):
    """
    FormatNumber is the default formatter and takes regular floating point values as input.
    """
    def __init__(self):
        self.category = 'num'

    def parse(self, val):
        """
        Convert a value to a float.

        Args:
            val: Number or numeric string.

        Returns:
            float: The converted value.

        Raises:
            ValueError: If the value is NaN or cannot be converted to a float.
        """
        # convert first so that numeric strings can be checked for NaN too
        val = float(val)
        if np.isnan(val):
            raise ValueError("number cannot be NaN")
        return val

    def parse_delta(self, val):
        """Convert a difference between two values, handled exactly like a regular value."""
        return self.parse(val)

    def format(self, val):
        """Format a number with up to six significant digits."""
        return '%.6g' % (val,)

    def get_scale(self, maxfreq=None):
        """Return a scale of 1 without a unit name, maxfreq is ignored."""
        return 1, None
Ancestors
Methods
def format(self, val)
-
Expand source code Browse git
def format(self, val):
    """Format a number with up to six significant digits."""
    template = '%.6g'
    return template % (val,)
def get_scale(self, maxfreq=None)
-
Expand source code Browse git
def get_scale(self, maxfreq=None):
    """Return a scale of 1 without a unit name, maxfreq is ignored."""
    scale, unit = 1, None
    return scale, unit
def parse(self, val)
-
Expand source code Browse git
def parse(self, val):
    """
    Convert a value to a float.

    Args:
        val: Number or numeric string.

    Returns:
        float: The converted value.

    Raises:
        ValueError: If the value is NaN or cannot be converted to a float.
    """
    # convert first so that numeric strings can be checked for NaN too
    val = float(val)
    if np.isnan(val):
        raise ValueError("number cannot be NaN")
    return val
def parse_delta(self, val)
-
Expand source code Browse git
def parse_delta(self, val):
    """Convert a difference between two values, handled exactly like a regular value."""
    delta = self.parse(val)
    return delta
class TransformBase
-
Expand source code Browse git
class TransformBase:
    """
    Base class for transformers of the Y axis. Subclasses override forward() and backward(), and optionally set_data() to derive parameters from the data.
    """

    def set_data(self, data):
        """Hook that receives the data before it is transformed, does nothing by default."""
        return None

    def forward(self, y, x=None):
        """Apply the transformation to y; must be overridden by subclasses."""
        raise NotImplementedError()

    def backward(self, y, x=None):
        """Undo the transformation on y; must be overridden by subclasses."""
        raise NotImplementedError()
Subclasses
Methods
def backward(self, y, x=None)
-
Expand source code Browse git
def backward(self, y, x=None):
    """Undo the transformation on y; must be overridden by subclasses."""
    raise NotImplementedError()
def forward(self, y, x=None)
-
Expand source code Browse git
def forward(self, y, x=None):
    """Apply the transformation to y; must be overridden by subclasses."""
    raise NotImplementedError()
def set_data(self, data)
-
Expand source code Browse git
def set_data(self, data):
    """Hook that receives the data before it is transformed, does nothing by default."""
    return None
class TransformDetrend (degree=1)
-
TransformDetrend is a transformer that detrends the data. It uses NumPy
polyfit
to find an n
degree polynomial that removes the trend. Args
degree
:int
- Polynomial degree that will be fit, i.e.
2
will find a quadratic trend and remove it from the data.
Expand source code Browse git
class TransformDetrend(TransformBase):
    """
    TransformDetrend is a transformer that detrends the data. It uses NumPy `polyfit` to find an `n` degree polynomial that removes the trend.

    Args:
        degree (int): Polynomial degree that will be fit, i.e. `2` will find a quadratic trend and remove it from the data.
    """
    # TODO: add regression?
    def __init__(self, degree=1):
        self.degree = degree

    def set_data(self, data):
        """
        Fit the trend polynomial on the observations that have not been removed.

        Args:
            data: Data object providing get_input_dims(), X, Y and mask.

        Raises:
            Exception: If the data has more than one input dimension.
        """
        if data.get_input_dims() != 1:
            raise Exception("can only detrend one dimensional input data")

        self.coef = np.polyfit(data.X[data.mask,0], data.Y[data.mask], self.degree)

    def forward(self, y, x=None):
        """Subtract the fitted trend, evaluated at the first column of x, from y."""
        return y - np.polyval(self.coef, x[:, 0])

    def backward(self, y, x=None):
        """Add the fitted trend, evaluated at the first column of x, back to y."""
        return y + np.polyval(self.coef, x[:, 0])
Ancestors
Methods
def backward(self, y, x=None)
-
Expand source code Browse git
def backward(self, y, x=None):
    """Add the fitted trend, evaluated at the first column of x, back to y."""
    trend = np.polyval(self.coef, x[:, 0])
    return y + trend
def forward(self, y, x=None)
-
Expand source code Browse git
def forward(self, y, x=None):
    """Subtract the fitted trend, evaluated at the first column of x, from y."""
    trend = np.polyval(self.coef, x[:, 0])
    return y - trend
def set_data(self, data)
-
Expand source code Browse git
def set_data(self, data):
    """
    Fit the trend polynomial on the observations that have not been removed.

    Args:
        data: Data object providing get_input_dims(), X, Y and mask.

    Raises:
        Exception: If the data has more than one input dimension.
    """
    if data.get_input_dims() != 1:
        raise Exception("can only detrend one dimensional input data")

    self.coef = np.polyfit(data.X[data.mask,0], data.Y[data.mask], self.degree)
class TransformLinear (scale=1.0, offset=0.0)
-
TransformLinear transforms the data linearly so that y => (y-offset)/scale.
Expand source code Browse git
class TransformLinear(TransformBase):
    """
    TransformLinear transforms the data linearly so that y => (y-offset)/scale.

    Args:
        scale (float): Value that the shifted data is divided by.
        offset (float): Value that is subtracted from the data.
    """
    def __init__(self, scale=1.0, offset=0.0):
        self.scale, self.offset = scale, offset

    def set_data(self, data):
        """Nothing is derived from the data, scale and offset are fixed at construction."""
        return None

    def forward(self, y, x=None):
        """Return (y-offset)/scale."""
        shifted = y - self.offset
        return shifted/self.scale

    def backward(self, y, x=None):
        """Return scale*y + offset, the inverse of forward()."""
        scaled = self.scale*y
        return scaled + self.offset
Ancestors
Methods
def backward(self, y, x=None)
-
Expand source code Browse git
def backward(self, y, x=None):
    """Return scale*y + offset, the inverse of forward()."""
    scaled = self.scale*y
    return scaled + self.offset
def forward(self, y, x=None)
-
Expand source code Browse git
def forward(self, y, x=None):
    """Return (y-offset)/scale."""
    shifted = y - self.offset
    return shifted/self.scale
def set_data(self, data)
-
Expand source code Browse git
def set_data(self, data):
    """Nothing is derived from the data, scale and offset are fixed at construction."""
    return None
class TransformLog
-
TransformLog is a transformer that takes the log of the data. Data is automatically shifted in the y-axis so that all values are greater than or equal to 1.
Expand source code Browse git
class TransformLog(TransformBase):
    """
    TransformLog is a transformer that takes the log of the data. Data is automatically shifted in the y-axis so that all values are greater than or equal to 1. The mean of the log values is subtracted afterwards.
    """
    def __init__(self):
        pass

    def set_data(self, data):
        """Derive the shift and the mean of the log values from all Y values of the data."""
        lowest = data.Y.min()
        # shift so that the smallest value maps to 1 and the log is defined everywhere
        self.shift = 1 - lowest
        self.mean = np.log(data.Y + self.shift).mean()

    def forward(self, y, x=None):
        """Return log(y + shift) - mean."""
        logged = np.log(y + self.shift)
        return logged - self.mean

    def backward(self, y, x=None):
        """Return exp(y + mean) - shift, the inverse of forward()."""
        restored = np.exp(y + self.mean)
        return restored - self.shift
Ancestors
Methods
def backward(self, y, x=None)
-
Expand source code Browse git
def backward(self, y, x=None):
    """Return exp(y + mean) - shift, the inverse of forward()."""
    restored = np.exp(y + self.mean)
    return restored - self.shift
def forward(self, y, x=None)
-
Expand source code Browse git
def forward(self, y, x=None):
    """Return log(y + shift) - mean."""
    logged = np.log(y + self.shift)
    return logged - self.mean
def set_data(self, data)
-
Expand source code Browse git
def set_data(self, data):
    """Derive the shift and the mean of the log values from all Y values of the data."""
    lowest = data.Y.min()
    # shift so that the smallest value maps to 1 and the log is defined everywhere
    self.shift = 1 - lowest
    self.mean = np.log(data.Y + self.shift).mean()
class TransformNormalize
-
TransformNormalize is a transformer that normalizes the data so that the y-axis is between -1 and 1.
Expand source code Browse git
class TransformNormalize(TransformBase):
    """
    TransformNormalize is a transformer that normalizes the data so that the y-axis is between -1 and 1.
    """
    def __init__(self):
        pass

    def set_data(self, data):
        """Store the minimum and maximum of the Y values that have not been removed."""
        observed = data.Y[data.mask]
        self.ymin = np.amin(observed)
        self.ymax = np.amax(observed)

    def forward(self, y, x=None):
        """Map y linearly so that ymin becomes -1 and ymax becomes 1."""
        span = self.ymax-self.ymin
        return -1.0 + 2.0*(y-self.ymin)/span

    def backward(self, y, x=None):
        """Map y back so that -1 becomes ymin and 1 becomes ymax."""
        span = self.ymax-self.ymin
        return (y+1.0)/2.0*span+self.ymin
Ancestors
Methods
def backward(self, y, x=None)
-
Expand source code Browse git
def backward(self, y, x=None):
    """Map y back so that -1 becomes ymin and 1 becomes ymax."""
    span = self.ymax-self.ymin
    return (y+1.0)/2.0*span+self.ymin
def forward(self, y, x=None)
-
Expand source code Browse git
def forward(self, y, x=None):
    """Map y linearly so that ymin becomes -1 and ymax becomes 1."""
    span = self.ymax-self.ymin
    return -1.0 + 2.0*(y-self.ymin)/span
def set_data(self, data)
-
Expand source code Browse git
def set_data(self, data):
    """Store the minimum and maximum of the Y values that have not been removed."""
    observed = data.Y[data.mask]
    self.ymin = np.amin(observed)
    self.ymax = np.amax(observed)
class TransformWhiten
-
Transform the data so it has mean 0 and variance 1
Expand source code Browse git
class TransformWhiten(TransformBase):
    """
    Transform the data so it has mean 0 and variance 1
    """
    def __init__(self):
        pass

    def set_data(self, data):
        """Store the mean and standard deviation of the Y values that have not been removed."""
        # take only the non-removed observations
        observed = data.Y[data.mask]
        self.mean = observed.mean()
        self.std = observed.std()

    def forward(self, y, x=None):
        """Return (y - mean) / std."""
        centered = y - self.mean
        return centered / self.std

    def backward(self, y, x=None):
        """Return y * std + mean, the inverse of forward()."""
        rescaled = y * self.std
        return rescaled + self.mean
Ancestors
Methods
def backward(self, y, x=None)
-
Expand source code Browse git
def backward(self, y, x=None):
    """Return y * std + mean, the inverse of forward()."""
    rescaled = y * self.std
    return rescaled + self.mean
def forward(self, y, x=None)
-
Expand source code Browse git
def forward(self, y, x=None):
    """Return (y - mean) / std."""
    centered = y - self.mean
    return centered / self.std
def set_data(self, data)
-
Expand source code Browse git
def set_data(self, data):
    """Store the mean and standard deviation of the Y values that have not been removed."""
    # take only the non-removed observations
    observed = data.Y[data.mask]
    self.mean = observed.mean()
    self.std = observed.std()