Module ktrain.tabular.data
Expand source code
from ..imports import *
from .. import utils as U
from . import preprocessor as pp
def tabular_from_df(train_df, label_columns=[], date_columns=[], val_df=None, val_pct=0.1,
                    is_regression=False, max_card=20, random_state=None, verbose=1):
    """
    ```
    Loads tabular data from a pandas DataFrame and preprocesses it.

    Args:
      train_df(pd.DataFrame): training data including the label column(s).
                              A copy is made before any processing.
      label_columns(list|str): name(s) of the column(s) holding the targets.
                               A one-element list is unwrapped to its single element
                               before being handed to the preprocessor.
      date_columns(list): forwarded unchanged to pp.TabularPreprocessor.
      val_df(pd.DataFrame): optional validation data. If None and val_pct is
                            truthy, a validation set is split off from train_df.
      val_pct(float): fraction of rows to hold out for validation when val_df
                      is None. Rows are assigned with a random mask, so the
                      resulting split sizes are approximate.
      is_regression(bool): forwarded unchanged to pp.TabularPreprocessor.
      max_card(int): forwarded unchanged to pp.TabularPreprocessor.
      random_state(int): if not None, seeds NumPy's global random generator
                         with this value before the train/validation split.
      verbose(int): forwarded to the preprocessing calls; when truthy, a notice
                    is also printed for integer columns treated as categorical.
    Returns:
      tuple: (trn, val, preproc) where val is None if no validation data exists.
    Raises:
      ValueError: if label_columns is None or an empty list/array.
    ```
    """
    train_df = train_df.copy()

    # strip space from string columns and check supplied val_df
    train_type_dict = pp.clean_df(train_df, val_df=val_df, return_types=True)

    # check label_columns
    if label_columns is None or (isinstance(label_columns, (list, np.ndarray)) and len(label_columns) == 0):
        raise ValueError('label_columns is required')
    if isinstance(label_columns, (list, np.ndarray)) and len(label_columns) == 1:
        label_columns = label_columns[0]

    # define original predictor_columns
    # Compare against a set of label names: a bare string label must be matched
    # by equality, not by substring containment (e.g. column 'a' vs label 'class').
    if isinstance(label_columns, (list, tuple, set, np.ndarray)):
        label_names = set(label_columns)
    else:
        label_names = {label_columns}
    predictor_columns = [col for col in train_df.columns.values if col not in label_names]

    # create validation set
    if val_df is None:
        if val_pct:
            df = train_df.copy()
            prop = 1 - val_pct
            # Seed with the caller's value so that random_state actually controls the split.
            # NOTE: this seeds NumPy's global generator.
            if random_state is not None:
                np.random.seed(random_state)
            msk = np.random.rand(len(df)) < prop
            train_df = df[msk]
            val_df = df[~msk]
    else:
        val_df = val_df.copy()

    procs = [pp.FillMissing, pp.Categorify, pp.Normalize]
    preproc = pp.TabularPreprocessor(predictor_columns, label_columns, date_columns=date_columns,
                                     is_regression=is_regression, procs=procs, max_card=max_card)
    trn = preproc.preprocess_train(train_df, verbose=verbose)

    if verbose:
        # Tell the user which integer-typed columns ended up as categorical variables.
        integer_cats = [col for col in preproc.cat_names
                        if train_type_dict.get(col, None) == 'integer']
        if integer_cats:
            print(f'\nThe following integer column(s) are being treated as categorical variables:\n{integer_cats}\n'
                  'To treat any of these column(s) as numerical, cast the column to float in DataFrame or CSV\n and re-run tabular_from* function.\n')

    val = None if val_df is None else preproc.preprocess_test(val_df, verbose=verbose)
    return (trn, val, preproc)
def tabular_from_csv(train_csv, label_columns=[], date_columns=[], val_csv=None, val_pct=0.1,
                     index_col=None, is_regression=False, max_card=20, random_state=None,
                     verbose=1):
    """
    ```
    Loads tabular data from CSV file

    Reads the CSV file(s) into DataFrames and delegates to tabular_from_df.

    Args:
      train_csv(str): path to the training CSV file (passed to pd.read_csv).
      label_columns(list|str): forwarded to tabular_from_df.
      date_columns(list): forwarded to tabular_from_df.
      val_csv(str): optional path to a validation CSV file. If None, no
                    validation DataFrame is passed on.
      val_pct(float): forwarded to tabular_from_df.
      index_col: forwarded to pd.read_csv for both the training and validation file.
      is_regression(bool): forwarded to tabular_from_df.
      max_card(int): forwarded to tabular_from_df.
      random_state(int): forwarded to tabular_from_df.
      verbose(int): forwarded to tabular_from_df.
    Returns:
      the value returned by tabular_from_df
    ```
    """
    # read in dataset
    train_df = pd.read_csv(train_csv, index_col=index_col)
    val_df = None if val_csv is None else pd.read_csv(val_csv, index_col=index_col)
    return tabular_from_df(train_df, label_columns=label_columns, date_columns=date_columns,
                           val_df=val_df, val_pct=val_pct,
                           is_regression=is_regression, max_card=max_card,
                           random_state=random_state, verbose=verbose)
Functions
def tabular_from_csv(train_csv, label_columns=[], date_columns=[], val_csv=None, val_pct=0.1, index_col=None, is_regression=False, max_card=20, random_state=None)
-
Loads tabular data from a CSV file into a DataFrame and passes it to tabular_from_df, returning that function's result.
Expand source code
def tabular_from_csv(train_csv, label_columns=[], date_columns=[], val_csv=None, val_pct=0.1,
                     index_col=None, is_regression=False, max_card=20, random_state=None,
                     verbose=1):
    """
    ```
    Loads tabular data from CSV file

    Reads the CSV file(s) into DataFrames and delegates to tabular_from_df.

    Args:
      train_csv(str): path to the training CSV file (passed to pd.read_csv).
      label_columns(list|str): forwarded to tabular_from_df.
      date_columns(list): forwarded to tabular_from_df.
      val_csv(str): optional path to a validation CSV file. If None, no
                    validation DataFrame is passed on.
      val_pct(float): forwarded to tabular_from_df.
      index_col: forwarded to pd.read_csv for both the training and validation file.
      is_regression(bool): forwarded to tabular_from_df.
      max_card(int): forwarded to tabular_from_df.
      random_state(int): forwarded to tabular_from_df.
      verbose(int): forwarded to tabular_from_df.
    Returns:
      the value returned by tabular_from_df
    ```
    """
    # read in dataset
    train_df = pd.read_csv(train_csv, index_col=index_col)
    val_df = None if val_csv is None else pd.read_csv(val_csv, index_col=index_col)
    return tabular_from_df(train_df, label_columns=label_columns, date_columns=date_columns,
                           val_df=val_df, val_pct=val_pct,
                           is_regression=is_regression, max_card=max_card,
                           random_state=random_state, verbose=verbose)
def tabular_from_df(train_df, label_columns=[], date_columns=[], val_df=None, val_pct=0.1, is_regression=False, max_card=20, random_state=None, verbose=1)
-
Expand source code
def tabular_from_df(train_df, label_columns=[], date_columns=[], val_df=None, val_pct=0.1,
                    is_regression=False, max_card=20, random_state=None, verbose=1):
    """
    ```
    Loads tabular data from a pandas DataFrame and preprocesses it.

    Args:
      train_df(pd.DataFrame): training data including the label column(s).
                              A copy is made before any processing.
      label_columns(list|str): name(s) of the column(s) holding the targets.
                               A one-element list is unwrapped to its single element
                               before being handed to the preprocessor.
      date_columns(list): forwarded unchanged to pp.TabularPreprocessor.
      val_df(pd.DataFrame): optional validation data. If None and val_pct is
                            truthy, a validation set is split off from train_df.
      val_pct(float): fraction of rows to hold out for validation when val_df
                      is None. Rows are assigned with a random mask, so the
                      resulting split sizes are approximate.
      is_regression(bool): forwarded unchanged to pp.TabularPreprocessor.
      max_card(int): forwarded unchanged to pp.TabularPreprocessor.
      random_state(int): if not None, seeds NumPy's global random generator
                         with this value before the train/validation split.
      verbose(int): forwarded to the preprocessing calls; when truthy, a notice
                    is also printed for integer columns treated as categorical.
    Returns:
      tuple: (trn, val, preproc) where val is None if no validation data exists.
    Raises:
      ValueError: if label_columns is None or an empty list/array.
    ```
    """
    train_df = train_df.copy()

    # strip space from string columns and check supplied val_df
    train_type_dict = pp.clean_df(train_df, val_df=val_df, return_types=True)

    # check label_columns
    if label_columns is None or (isinstance(label_columns, (list, np.ndarray)) and len(label_columns) == 0):
        raise ValueError('label_columns is required')
    if isinstance(label_columns, (list, np.ndarray)) and len(label_columns) == 1:
        label_columns = label_columns[0]

    # define original predictor_columns
    # Compare against a set of label names: a bare string label must be matched
    # by equality, not by substring containment (e.g. column 'a' vs label 'class').
    if isinstance(label_columns, (list, tuple, set, np.ndarray)):
        label_names = set(label_columns)
    else:
        label_names = {label_columns}
    predictor_columns = [col for col in train_df.columns.values if col not in label_names]

    # create validation set
    if val_df is None:
        if val_pct:
            df = train_df.copy()
            prop = 1 - val_pct
            # Seed with the caller's value so that random_state actually controls the split.
            # NOTE: this seeds NumPy's global generator.
            if random_state is not None:
                np.random.seed(random_state)
            msk = np.random.rand(len(df)) < prop
            train_df = df[msk]
            val_df = df[~msk]
    else:
        val_df = val_df.copy()

    procs = [pp.FillMissing, pp.Categorify, pp.Normalize]
    preproc = pp.TabularPreprocessor(predictor_columns, label_columns, date_columns=date_columns,
                                     is_regression=is_regression, procs=procs, max_card=max_card)
    trn = preproc.preprocess_train(train_df, verbose=verbose)

    if verbose:
        # Tell the user which integer-typed columns ended up as categorical variables.
        integer_cats = [col for col in preproc.cat_names
                        if train_type_dict.get(col, None) == 'integer']
        if integer_cats:
            print(f'\nThe following integer column(s) are being treated as categorical variables:\n{integer_cats}\n'
                  'To treat any of these column(s) as numerical, cast the column to float in DataFrame or CSV\n and re-run tabular_from* function.\n')

    val = None if val_df is None else preproc.preprocess_test(val_df, verbose=verbose)
    return (trn, val, preproc)