# TODO: type hint the returns
# TODO: Issue with Feature.to_dict (2021.09.13 - NS)
# TODO: Feature, TimeSeriesSettings, ProblemDefinition, Json_AI to_dict/to_json outputs don't look correct
# TODO: Why does TimeSeriesSettings have an encode_json flag?
# TODO: Because from_dict intakes "obj", it's incorrectly read in docs
# TODO: DataAnalysis needs in-doc references [NATASHA]
# TODO: df_std_dev is not clear in behavior; the name implies the std. dev. of every column, but that is not true.
# It should be renamed df_std_target_dev
# TODO: How do you specify a custom accuracy function when it's a str? I'm assuming via an import
# TODO: Problem definition missing a few terms
# TODO: Model Analysis
# TODO: Analyzer
from typing import Dict, List, Optional, Union
import sys
if sys.version_info >= (3, 8):
from typing import TypedDict
else:
from typing_extensions import TypedDict
from dataclasses import dataclass
from lightwood.helpers.log import log
from dataclasses_json import dataclass_json
from dataclasses_json.core import _asdict, Json
import json
# See: https://www.python.org/dev/peps/pep-0589/ for how this works
# Not very intuitive but very powerful abstraction, might be useful in other places (@TODO)
class Module(TypedDict):
    """
    A block of code that ends up being called from the JSON AI; it stands for either an object instantiation
    or a function call.

    :param module: Name of the module (function or class name)
    :param args: Arguments to pass to the function or constructor
    """
    module: str
    args: Dict[str, str]
@dataclass
class Feature:
    """
    Within a dataframe, each column is considered its own "feature" (unless ignored etc.).
    Each feature is described by the following attributes:

    :param encoder: the methodology for encoding a feature (a Lightwood Encoder)
    :param data_dtype: The type of information within this column (ex.: numerical, categorical, etc.)
    :param dependency: Any custom attributes for this feature that may require non-standard processing. This highly
        depends on the encoder (ex: Pretrained text may be fine-tuned on the target; time-series requires prior
        time-steps).
    """
    encoder: Module
    data_dtype: Optional[str] = None
    dependency: Optional[List[str]] = None

    @staticmethod
    def from_dict(obj: Dict) -> "Feature":
        """
        Create a ``Feature`` object from its dictionary representation.

        :param obj: A dictionary representation of a column feature's attributes. The key *encoder* is mandatory;
            *data_dtype* and *dependency* are optional and default to ``None``.

        :Example:

        >>> my_dict = {"feature_A": {"encoder": {"module": "MyEncoder", "args": {}}, "data_dtype": "categorical"}}
        >>> print(Feature.from_dict(my_dict["feature_A"]))
        Feature(encoder={'module': 'MyEncoder', 'args': {}}, data_dtype='categorical', dependency=None)

        :returns: A Feature object with loaded information.
        :raises KeyError: If ``obj`` has no *encoder* key.
        """
        return Feature(
            encoder=obj["encoder"],
            data_dtype=obj.get("data_dtype", None),
            dependency=obj.get("dependency", None),
        )

    @staticmethod
    def from_json(data: str) -> "Feature":
        """
        Create a ``Feature`` object from its JSON representation. This method calls :meth:`from_dict` after
        loading the JSON string.

        :param data: A JSON representation of the feature.

        :returns: Loaded information into the Feature representation.
        """
        return Feature.from_dict(json.loads(data))

    def to_dict(self, encode_json=False) -> Dict[str, Json]:
        """
        Converts a Feature to a dictionary representation, leaving out every attribute whose value is ``None``.

        :param encode_json: Forwarded unchanged to ``dataclasses_json``'s ``_asdict``.

        :returns: A python dictionary with the names of the non-``None`` attributes of the Feature and their
            respective values.
        """
        as_dict = _asdict(self, encode_json=encode_json)
        return {key: value for key, value in as_dict.items() if value is not None}

    def to_json(self) -> str:
        """
        Converts a Feature into a JSON string. Calls ``to_dict`` under the hood.

        :returns: JSON text (indented by 4 spaces) for the non-``None`` attributes of the Feature.
        """
        return json.dumps(self.to_dict(), indent=4)
@dataclass_json
@dataclass
class Output:
    """
    Describes the output feature, i.e. the target column of the dataset. Four attributes are expected, listed
    below.

    Note, currently supervised tasks are supported, hence categorical, numerical, and time-series are the expected
    output types. Complex features such as text generation are not currently available by default.

    :param data_dtype: The type of information within the target column (ex.: numerical, categorical, etc.).
    :param encoder: the methodology for encoding the target feature (a Lightwood Encoder). There can only be one
        encoder for the output target.
    :param mixers: The list of ML algorithms that are trained for the target distribution.
    :param ensemble: For a panel of ML algorithms, the approach of selecting the best mixer, and the metrics used
        in that evaluation.
    """
    # Only ``data_dtype`` is mandatory; the remaining attributes default to ``None``.
    data_dtype: str
    encoder: str = None
    mixers: List[str] = None
    ensemble: str = None
@dataclass_json
@dataclass
class StatisticalAnalysis:
    """
    The Statistical Analysis data class lets users inspect key descriptors of their data obtained with simple
    techniques such as histograms, mean and standard deviation, word count, missing values, and any detected bias
    in the information.

    :param nr_rows: Number of rows (samples) in the dataset
    :param df_std_dev: The standard deviation of the target of the dataset
    :param train_observed_classes: Undocumented; per the inline type hint, ``None`` or a list of strings.
    :param target_class_distribution: Undocumented; per the inline type hint, a mapping from string to float.
    :param histograms: Undocumented; per the inline type hint, a nested mapping ending in lists.
    :param buckets: Undocumented; per the inline type hint, a nested mapping ending in lists.
    :param missing: Undocumented.
    :param distinct: Undocumented.
    :param bias: Undocumented.
    :param avg_words_per_sentence: Undocumented.
    :param positive_domain: Undocumented boolean flag.
    """
    nr_rows: int
    df_std_dev: Optional[float]
    train_observed_classes: object  # Union[None, List[str]]
    target_class_distribution: object  # Dict[str, float]
    histograms: object  # Dict[str, Dict[str, List[object]]]
    buckets: object  # Dict[str, Dict[str, List[object]]]
    missing: object
    distinct: object
    bias: object
    avg_words_per_sentence: object
    positive_domain: bool
@dataclass_json
@dataclass
class DataAnalysis:
    """
    Data Analysis bundles a :class:`.StatisticalAnalysis` and a :class:`.TypeInformation` together. Further
    details can be found in their respective documentation references.

    :param statistical_analysis: The ``StatisticalAnalysis`` of the data.
    :param type_information: The ``TypeInformation`` of the data.
    """
    statistical_analysis: StatisticalAnalysis
    # NOTE(review): ``TypeInformation`` is neither defined nor imported in the visible part of this module;
    # confirm it is in scope before this class body is evaluated.
    type_information: TypeInformation
@dataclass
class TimeseriesSettings:
    """
    For time-series specific problems, more specific treatment of the data is necessary. The following attributes
    enable time-series tasks to be carried out properly.

    :param is_timeseries: Whether the input data should be treated as time series; if true, this flag is checked in
        subsequent internal steps to ensure processing is appropriate for time-series data.
    :param order_by: A list of columns by which the data should be ordered.
    :param group_by: Optional list of columns by which the data should be grouped. Each different combination of
        values for these columns will yield a different series.
    :param window: The temporal horizon (number of rows) that a model intakes to "look back" into when making a
        prediction, after the rows are ordered by order_by columns and split into groups if applicable.
    :param nr_predictions: The number of points in the future that predictions should be made for. Once trained,
        the model will be able to predict up to this many points into the future. ``from_dict`` uses 1 when the
        setting is absent.
    :param historical_columns: The temporal dynamics of these columns will be used as additional context to train
        the time series predictor. Note that a non-historical column shall still be used to forecast, but without
        considering their change through time.
    :param target_type: Automatically inferred dtype of the target (e.g. `dtype.integer`, `dtype.float`).
    :param use_previous_target: Use the previous values of the target column to generate predictions. Defaults to
        True.
    :param allow_incomplete_history: Defaults to False. NOTE(review): semantics are not documented in this module;
        confirm against the code that consumes it.
    """
    is_timeseries: bool
    order_by: Optional[List[str]] = None
    window: Optional[int] = None
    group_by: Optional[List[str]] = None
    use_previous_target: bool = True
    nr_predictions: Optional[int] = None
    historical_columns: Optional[List[str]] = None
    # @TODO: is the current setter (outside of initialization) a sane option?
    # @TODO: George: No, I don't think it is, we need to pass this some other way
    target_type: str = ""
    allow_incomplete_history: bool = False

    @staticmethod
    def from_dict(obj: Dict) -> "TimeseriesSettings":
        """
        Creates a TimeseriesSettings object from python dictionary specifications.

        An empty dictionary (or ``None``) yields settings with ``is_timeseries=False``. Otherwise the problem is
        flagged as time series and every key of ``obj`` is copied onto the resulting object as an attribute.

        :param obj: A python dictionary with the necessary representation for time-series. The only mandatory
            keys are ``order_by`` and ``window``.

        :returns: A populated ``TimeseriesSettings`` object.
        :raises Exception: If ``obj`` is non-empty and lacks ``order_by`` or ``window``.
        """
        # Guard clause: nothing specified means this is not a time-series problem.
        if not obj:
            return TimeseriesSettings(is_timeseries=False)

        for mandatory_setting in ("order_by", "window"):
            if mandatory_setting not in obj:
                err = f"Missing mandatory timeseries setting: {mandatory_setting}"
                log.error(err)
                raise Exception(err)

        # Values used when the corresponding key is absent from ``obj``; everything else keeps its
        # dataclass default.
        timeseries_settings = TimeseriesSettings(
            is_timeseries=True,
            order_by=obj["order_by"],
            window=obj["window"],
            historical_columns=[],
            nr_predictions=1,
        )
        # Every user-provided setting (including keys that are not declared fields) overrides the defaults.
        for setting, value in obj.items():
            setattr(timeseries_settings, setting, value)

        return timeseries_settings

    @staticmethod
    def from_json(data: str) -> "TimeseriesSettings":
        """
        Creates a TimeseriesSettings object from JSON specifications via python dictionary.

        :param data: JSON string with the necessary time-series specifications

        :returns: A populated ``TimeseriesSettings`` object.
        """
        return TimeseriesSettings.from_dict(json.loads(data))

    def to_dict(self, encode_json=False) -> Dict[str, Json]:
        """
        Creates a dictionary from ``TimeseriesSettings`` object

        :param encode_json: Forwarded unchanged to ``dataclasses_json``'s ``_asdict``.

        :returns: A python dictionary containing the ``TimeSeriesSettings`` specifications.
        """
        return _asdict(self, encode_json=encode_json)

    def to_json(self) -> str:
        """
        Creates JSON config from TimeseriesSettings object

        :returns: A JSON string containing the ``TimeSeriesSettings`` specifications.
        """
        return json.dumps(self.to_dict())
@dataclass
class ProblemDefinition:
    """
    The ``ProblemDefinition`` object indicates details on how the models that predict the target are prepared.
    The only required specification from a user is the ``target``, which indicates the column within the input
    data that the user is trying to predict. Within the ``ProblemDefinition``, the user can specify aspects
    about how long the feature-engineering preparation may take, and nuances about training the models.

    Defaults mentioned below are the ones applied by :meth:`from_dict` when a key is absent.

    :param target: The name of the target column; this is the column that will be used as the goal of the
        prediction.
    :param pct_invalid: Amount of data points maximally tolerated as invalid/missing/unknown (presumably a
        percentage, going by the name; confirm). If the data cleaning process exceeds this number, no subsequent
        steps will be taken. Defaults to 2.
    :param unbias_target: Undocumented boolean flag; defaults to True.
    :param seconds_per_mixer: Number of seconds maximum to spend PER mixer trained in the list of possible mixers.
    :param seconds_per_encoder: Number of seconds maximum to spend when training an encoder that requires data to
        learn a representation.
    :param time_aim: Time budget (in seconds) to train all needed components for the predictive tasks, including
        encoders and models.
    :param target_weights: Undocumented; an optional list of floats.
    :param positive_domain: For numerical tasks, force predictor output to be positive (integer or float).
    :param timeseries_settings: TimeseriesSettings object for time-series tasks, refer to its documentation for
        available settings.
    :param anomaly_detection: Whether to conduct unsupervised anomaly detection; currently supported only for
        time-series. Defaults to True.
    :param ignore_features: The names of the columns the user wishes to ignore in the ML pipeline. Any column name
        found in this list will be automatically removed from subsequent steps in the ML pipeline.
    :param fit_on_validation: Whether to fit the model on the held-out validation data. Validation data is strictly
        used to evaluate how well a model is doing and is NEVER trained. However, in cases where users anticipate
        new incoming data over time, the user may train the model further using the entire dataset. Defaults to
        True.
    :param strict_mode: Undocumented boolean flag; defaults to True.
    :param seed_nr: Undocumented integer (presumably a random seed; confirm); defaults to 420.
    """
    target: str
    pct_invalid: float
    unbias_target: bool
    seconds_per_mixer: Union[int, None]
    seconds_per_encoder: Union[int, None]
    time_aim: Union[int, None]
    target_weights: Union[List[float], None]
    positive_domain: bool
    timeseries_settings: TimeseriesSettings
    anomaly_detection: bool
    ignore_features: List[str]
    fit_on_validation: bool
    strict_mode: bool
    seed_nr: int

    @staticmethod
    def from_dict(obj: Dict) -> "ProblemDefinition":
        """
        Creates a ProblemDefinition object from a python dictionary with necessary specifications.

        :param obj: A python dictionary with the necessary features for the ``ProblemDefinition`` class.
            Only requires ``target`` to be specified.

        :returns: A populated ``ProblemDefinition`` object.
        :raises KeyError: If ``obj`` has no ``target`` key.
        """
        return ProblemDefinition(
            target=obj['target'],
            pct_invalid=obj.get('pct_invalid', 2),
            unbias_target=obj.get('unbias_target', True),
            seconds_per_mixer=obj.get('seconds_per_mixer', None),
            seconds_per_encoder=obj.get('seconds_per_encoder', None),
            time_aim=obj.get('time_aim', None),
            target_weights=obj.get('target_weights', None),
            positive_domain=obj.get('positive_domain', False),
            # An absent entry becomes ``{}``, which TimeseriesSettings treats as "not a time-series problem".
            timeseries_settings=TimeseriesSettings.from_dict(obj.get('timeseries_settings', {})),
            anomaly_detection=obj.get('anomaly_detection', True),
            ignore_features=obj.get('ignore_features', []),
            fit_on_validation=obj.get('fit_on_validation', True),
            strict_mode=obj.get('strict_mode', True),
            seed_nr=obj.get('seed_nr', 420),
        )

    @staticmethod
    def from_json(data: str) -> "ProblemDefinition":
        """
        Creates a ProblemDefinition Object from a JSON string.

        :param data: JSON text that decodes to the dictionary accepted by :meth:`from_dict`.

        :returns: A populated ProblemDefinition object.
        """
        return ProblemDefinition.from_dict(json.loads(data))

    def to_dict(self, encode_json=False) -> Dict[str, Json]:
        """
        Creates a python dictionary from the ProblemDefinition object

        :param encode_json: Forwarded unchanged to ``dataclasses_json``'s ``_asdict``.

        :returns: A python dictionary
        """
        return _asdict(self, encode_json=encode_json)

    def to_json(self) -> str:
        """
        Creates a JSON config from the ProblemDefinition object

        :returns: The JSON string obtained by dumping :meth:`to_dict`.
        """
        return json.dumps(self.to_dict())
@dataclass
class JsonAI:
    """
    The JsonAI Class allows users to construct flexible JSON config to specify their ML pipeline. JSON-AI follows a
    recipe of how to pre-process data, construct features, and train on the target column. To do so, the following
    specifications are required internally.

    :param features: The corresponding ``Feature`` object for each of the column names of the dataset
    :param outputs: The column name of the target and its ``Output`` object
    :param problem_definition: The ``ProblemDefinition`` criteria.
    :param identifiers: A dictionary of column names and respective data types that are likely identifiers/IDs
        within the data. Through the default cleaning process, these are ignored.
    :param cleaner: The Cleaner object represents the pre-processing step on a dataframe. The user can specify
        custom subroutines, if they choose, on how to handle preprocessing. Alternatively, "None" suggests
        Lightwood's default approach in ``data.cleaner``.
    :param splitter: The Splitter object is the method in which the input data is split into
        training/validation/testing data.
    :param analyzer: The Analyzer object is used to evaluate how well a model performed on the predictive task.
    :param explainer: The Explainer object deploys explainability tools of interest on a model to indicate how
        well a model generalizes its predictions.
    :param analysis_blocks: The blocks that get used in both analysis and inference inside the analyzer and
        explainer blocks.
    :param timeseries_transformer: Undocumented optional ``Module``.
    :param timeseries_analyzer: Undocumented optional ``Module``.
    :param accuracy_functions: A list of performance metrics used to evaluate the best mixers.
    """
    features: Dict[str, Feature]
    outputs: Dict[str, Output]
    problem_definition: ProblemDefinition
    identifiers: Dict[str, str]
    cleaner: Optional[Module] = None
    splitter: Optional[Module] = None
    analyzer: Optional[Module] = None
    explainer: Optional[Module] = None
    analysis_blocks: Optional[List[Module]] = None
    timeseries_transformer: Optional[Module] = None
    timeseries_analyzer: Optional[Module] = None
    accuracy_functions: Optional[List[str]] = None

    @staticmethod
    def from_dict(obj: Dict) -> "JsonAI":
        """
        Creates a JSON-AI object from dictionary specifications of the JSON-config.

        :param obj: Dictionary holding the mandatory keys ``features``, ``outputs``, ``problem_definition`` and
            ``identifiers``; every other attribute of the class is optional and defaults to ``None``.

        :returns: A populated ``JsonAI`` object.
        :raises KeyError: If one of the mandatory keys is missing.
        """
        # Mandatory sections are resolved first, in declaration order.
        features = {name: Feature.from_dict(spec) for name, spec in obj["features"].items()}
        outputs = {name: Output.from_dict(spec) for name, spec in obj["outputs"].items()}
        problem_definition = ProblemDefinition.from_dict(obj["problem_definition"])
        identifiers = obj["identifiers"]

        # Optional sections are passed through untouched.
        optional_keys = (
            "cleaner",
            "splitter",
            "analyzer",
            "explainer",
            "analysis_blocks",
            "timeseries_transformer",
            "timeseries_analyzer",
            "accuracy_functions",
        )
        optional = {key: obj.get(key, None) for key in optional_keys}

        return JsonAI(
            features=features,
            outputs=outputs,
            problem_definition=problem_definition,
            identifiers=identifiers,
            **optional,
        )

    @staticmethod
    def from_json(data: str) -> "JsonAI":
        """
        Creates a JSON-AI object from a JSON config string.

        :param data: JSON text that decodes to the dictionary accepted by :meth:`from_dict`.

        :returns: A populated ``JsonAI`` object.
        """
        return JsonAI.from_dict(json.loads(data))

    def to_dict(self, encode_json=False) -> Dict[str, Json]:
        """
        Creates a python dictionary with necessary modules within the ML pipeline specified from the JSON-AI object.

        Features are serialized through ``Feature.to_dict`` (which drops their ``None`` attributes), and top-level
        entries whose value is ``None`` are left out. Nested values of the other entries are returned as produced
        by ``_asdict``.

        :param encode_json: Forwarded to ``_asdict`` and to every ``Feature.to_dict`` call.

        :returns: A python dictionary that has the necessary components of the ML pipeline for a given dataset.
        """
        as_dict = _asdict(self, encode_json=encode_json)
        # Replace the generic conversion of the features by each feature's own representation.
        as_dict["features"] = {
            name: self.features[name].to_dict(encode_json=encode_json) for name in self.features
        }
        return {key: value for key, value in as_dict.items() if value is not None}

    def to_json(self) -> str:
        """
        Creates JSON config to represent the necessary modules within the ML pipeline specified from the JSON-AI
        object.

        :returns: A JSON string (indented by 4 spaces) that has the necessary components of the ML pipeline for a
            given dataset.
        """
        return json.dumps(self.to_dict(), indent=4)
@dataclass_json
@dataclass
class ModelAnalysis:
    """
    The ``ModelAnalysis`` class stores useful information to describe a model and understand its predictive
    performance on a validation dataset.

    For each trained ML algorithm, we store:

    :param accuracies: Dictionary with obtained values for each accuracy function (specified in JsonAI)
    :param accuracy_histogram: Dictionary with histograms of reported accuracy by target value.
    :param accuracy_samples: Dictionary with sampled pairs of observed target values and respective predictions.
    :param train_sample_size: Size of the training set (data that parameters are updated on)
    :param test_sample_size: Size of the testing set (explicitly held out)
    :param column_importances: Dictionary with the importance of each column for the model, as estimated by an
        approach that closely follows a leave-one-covariate-out strategy.
    :param confusion_matrix: A confusion matrix for the validation dataset.
    :param histograms: Histogram for each dataset feature.
    :param dtypes: Inferred data types for each dataset feature.
    """
    accuracies: Dict[str, float]
    accuracy_histogram: Dict[str, list]
    accuracy_samples: Dict[str, list]
    train_sample_size: int
    test_sample_size: int
    column_importances: Dict[str, float]
    confusion_matrix: object
    histograms: object
    dtypes: object
@dataclass
class PredictionArguments:
    """
    Holds every argument that can be handed to a Lightwood predictor at inference time.

    On each predict call, all arguments included in a parameter dictionary will update the respective fields
    in the `PredictionArguments` instance that the predictor will have.

    :param predict_proba: triggers (where supported) predictions in raw probability output form. I.e. for
        classifiers, instead of returning only the predicted class, the output additionally includes the assigned
        probability for each class.
    :param all_mixers: forces an ensemble to return predictions emitted by all its internal mixers.
    :param fixed_confidence: For analyzer module, specifies a fixed `alpha` confidence for the model calibration so
        that predictions, in average, are correct `alpha` percent of the time.
    :param anomaly_error_rate: Error rate for unsupervised anomaly detection. Bounded between 0.01 and 0.99
        (respectively implies wider and tighter bounds, all other parameters being equal).
    :param anomaly_cooldown: Sets the minimum amount of timesteps between consecutive firings of the anomaly
        detector.
    """
    predict_proba: bool = False
    all_mixers: bool = False
    fixed_confidence: Union[int, float, None] = None
    anomaly_error_rate: Union[float, None] = None
    anomaly_cooldown: int = 1

    @staticmethod
    def from_dict(obj: Dict):
        """
        Builds a ``PredictionArguments`` object out of a python dictionary.

        Keys missing from ``obj`` fall back to the defaults declared on the class; keys that do not match a field
        are ignored.

        :param obj: A python dictionary with the necessary features for the ``PredictionArguments`` class.

        :returns: A populated ``PredictionArguments`` object.
        """
        # maybe this should be stateful instead, and save the latest used value for each field?
        field_names = (
            'predict_proba',
            'all_mixers',
            'fixed_confidence',
            'anomaly_error_rate',
            'anomaly_cooldown',
        )
        # The dataclass defaults live on the class as plain attributes, hence the ``getattr`` fallback.
        chosen = {name: obj.get(name, getattr(PredictionArguments, name)) for name in field_names}
        return PredictionArguments(**chosen)

    def to_dict(self, encode_json=False) -> Dict[str, Json]:
        """
        Turns the ``PredictionArguments`` object into a python dictionary.

        :returns: A python dictionary
        """
        as_dict = _asdict(self, encode_json=encode_json)
        return as_dict