--- title: Mixed data keywords: fastai sidebar: home_sidebar summary: "DataLoader that can take data from multiple dataloaders with different types of data" description: "DataLoader that can take data from multiple dataloaders with different types of data" nb_path: "nbs/006_data.mixed.ipynb" ---
{% raw %}
{% endraw %} {% raw %}
{% endraw %} {% raw %}

class MixedDataLoader[source]

MixedDataLoader(*loaders, path='.', shuffle=False, device=None, bs=None)

{% endraw %} {% raw %}

class MixedDataLoaders[source]

MixedDataLoaders(*loaders, path='.', device=None) :: DataLoaders

Basic wrapper around several DataLoaders.

{% endraw %} {% raw %}
{% endraw %} {% raw %}

get_mixed_dls[source]

get_mixed_dls(*dls, device=None, shuffle_train=None, shuffle_valid=None, **kwargs)

{% endraw %} {% raw %}
{% endraw %} {% raw %}
# Example: build two tabular DataLoaders from the same dataframe and the same
# splits, each one using a different set of feature columns.
from tsai.data.tabular import *

# Resolve the ADULT_SAMPLE dataset location and read adult.csv into a dataframe.
path = untar_data(URLs.ADULT_SAMPLE)
df = pd.read_csv(path/'adult.csv')
# df['salary'] = np.random.rand(len(df)) # uncomment to simulate a cont dependent variable
target = 'salary'
# One split over the dataframe's row indices, reused by both loaders below.
splits = RandomSplitter()(range_of(df))

# First loader: three categorical and two continuous columns, batch size 512.
cat_names = ['workclass', 'education', 'marital-status']
cont_names = ['age', 'fnlwgt']
dls1 = get_tabular_dls(df, cat_names=cat_names, cont_names=cont_names, y_names=target, splits=splits, bs=512)
dls1.show_batch()

# Second loader: no categorical columns and a single continuous column,
# built with a different batch size (128) from the first loader.
cat_names = None #['occupation', 'relationship', 'race']
cont_names = ['education-num']
dls2 = get_tabular_dls(df, cat_names=cat_names, cont_names=cont_names, y_names=target, splits=splits, bs=128)
dls2.show_batch()
workclass education marital-status age fnlwgt salary
0 Private Some-college Never-married 20.999999 111675.997931 <50k
1 Private 11th Never-married 22.000000 271274.002826 <50k
2 Private Bachelors Never-married 30.000000 347166.005432 >=50k
3 Private HS-grad Married-civ-spouse 49.000000 128132.000308 >=50k
4 Private Some-college Never-married 20.000001 231286.000799 <50k
5 Private HS-grad Separated 52.000000 363404.997839 <50k
6 Self-emp-not-inc Some-college Married-civ-spouse 57.000000 225912.999023 >=50k
7 Private HS-grad Married-civ-spouse 42.000000 39060.005863 >=50k
8 Private Some-college Married-civ-spouse 54.000001 257336.997400 >=50k
9 Self-emp-not-inc Some-college Married-civ-spouse 44.000000 219590.999900 <50k
education-num_na education-num salary
0 False 9.0 <50k
1 False 9.0 <50k
2 False 6.0 <50k
3 False 11.0 <50k
4 False 15.0 <50k
5 False 13.0 <50k
6 False 9.0 <50k
7 False 10.0 <50k
8 False 9.0 <50k
9 False 10.0 <50k
{% endraw %} {% raw %}
# Combine the two tabular DataLoaders into one mixed DataLoaders object,
# requesting a batch size of 8.
dls = get_mixed_dls(dls1, dls2, bs=8)
# Pull the first batch from the train and valid loaders.
first(dls.train)
first(dls.valid)
# Round-trip the mixed DataLoaders through torch.save / torch.load and show a
# batch from the reloaded object.
# NOTE(review): assumes the 'export' directory already exists — confirm.
torch.save(dls,'export/mixed_dls.pth')
del dls
dls = torch.load('export/mixed_dls.pth')
dls.train.show_batch()
workclass education marital-status age fnlwgt salary
0 ? Bachelors Never-married 23.000000 123983.002295 <50k
1 Private 7th-8th Never-married 19.000000 104829.997271 <50k
2 State-gov HS-grad Widowed 65.999999 102639.999497 <50k
3 Private Some-college Never-married 22.000000 359758.994678 <50k
4 Private Bachelors Never-married 28.000000 161538.001120 <50k
5 Private HS-grad Married-civ-spouse 40.000000 309311.005654 <50k
6 Private Bachelors Married-civ-spouse 28.000000 130066.997430 >=50k
7 Self-emp-inc Bachelors Married-civ-spouse 38.000000 299035.999132 >=50k
education-num_na education-num salary
0 False 13.0 <50k
1 False 4.0 <50k
2 False 9.0 <50k
3 False 10.0 <50k
4 False 13.0 <50k
5 False 9.0 <50k
6 False 13.0 >=50k
7 False 13.0 >=50k
{% endraw %} {% raw %}
# Take the first training batch and display its inputs. In the output shown
# below, xb is a tuple holding one (tensor, tensor) pair per source loader.
xb, yb = first(dls.train)
xb
((tensor([[ 1, 10,  5],
          [ 5,  6,  5],
          [ 8, 12,  7],
          [ 5, 16,  5],
          [ 5, 10,  5],
          [ 5, 12,  3],
          [ 5, 10,  3],
          [ 6, 10,  3]]),
  tensor([[-1.1356, -0.6232],
          [-1.4290, -0.8034],
          [ 2.0185, -0.8240],
          [-1.2089,  1.5952],
          [-0.7688, -0.2698],
          [ 0.1114,  1.1205],
          [-0.7688, -0.5659],
          [-0.0353,  1.0239]])),
 (tensor([[1],
          [1],
          [1],
          [1],
          [1],
          [1],
          [1],
          [1]]),
  tensor([[ 1.1516],
          [-2.3784],
          [-0.4173],
          [-0.0251],
          [ 1.1516],
          [-0.4173],
          [ 1.1516],
          [ 1.1516]])))
{% endraw %} {% raw %}
# Inspect the shapes of the four input tensors in the first training batch:
# xs[i] is the input pair of the i-th source loader, xs[i][j] its j-th tensor.
xs, ys = first(dls.train)
xs[0][0].shape, xs[0][1].shape, xs[1][0].shape, xs[1][1].shape
(torch.Size([8, 3]),
 torch.Size([8, 2]),
 torch.Size([8, 1]),
 torch.Size([8, 1]))
{% endraw %} {% raw %}
# Example: build a time series DataLoaders from synthetic data in which every
# sample is filled with a single constant value.
from tsai.data.validation import TimeSplitter
from tsai.data.core import TSRegression, get_ts_dls
# arange(8) expanded to shape (8, 2, 5): sample i is filled with the value i.
X = np.repeat(np.repeat(np.arange(8)[:, None, None], 2, 1), 5, 2).astype(float)
# Stack the 8 samples twice, giving 16 samples in total.
X = np.concatenate([X, X])
# Targets are 0..7 repeated twice, so each target equals its sample's fill value.
y = np.concatenate([np.arange(len(X)//2)]*2)
# Only used by the commented-out line below, which would map y to letters.
alphabet = np.array(list(string.ascii_lowercase))
# y = alphabet[y]
splits = TimeSplitter(.5, show_plot=False)(range_of(X))
# No transform on X; TSRegression() on y.
tfms = [None, TSRegression()]
dls1 = get_ts_dls(X, y, splits=splits, tfms=tfms)
dls1.one_batch()
(TSTensor(samples:8, vars:2, len:5), tensor([0., 1., 2., 3., 4., 5., 6., 7.]))
{% endraw %} {% raw %}
# Example: build a tabular DataLoaders from synthetic data, reusing the `y` and
# `splits` defined for the time series example.
# Row i is (i, 10*i, 100*i) for i in 0..7, and the 8 rows are stacked twice.
data = np.concatenate([np.repeat(np.arange(8)[:, None], 3, 1)*np.array([1, 10, 100])]*2)
df = pd.DataFrame(data, columns=['cat1', 'cat2', 'cont'])
df['cont'] = df['cont'].astype(float)
df['target'] = y
cat_names = ['cat1', 'cat2']
cont_names = ['cont']
target = 'target'
# Normalize is commented out of procs; the batch printed below shows the
# continuous column still on its raw 0..700 scale.
dls2 = get_tabular_dls(df, procs=[Categorify, FillMissing, #Normalize
                                 ], cat_names=cat_names, cont_names=cont_names, y_names=target, splits=splits, bs=8)
dls2.one_batch()
(tensor([[1, 1],
         [7, 7],
         [5, 5],
         [3, 3],
         [6, 6],
         [8, 8],
         [4, 4],
         [2, 2]]),
 tensor([[  0.],
         [600.],
         [400.],
         [200.],
         [500.],
         [700.],
         [300.],
         [100.]]),
 tensor([[0],
         [6],
         [4],
         [2],
         [5],
         [7],
         [3],
         [1]], dtype=torch.int8))
{% endraw %} {% raw %}
# Select an entry of `_loaders` by indexing with the boolean
# `num_workers == 0`, call it on the train loader's `fake_l`, and wrap the
# result in zip(). With a single iterable, zip yields 1-tuples (see the
# printed output below).
z = zip(_loaders[dls1.train.fake_l.num_workers == 0](dls1.train.fake_l))
# Print only the first item.
for b in z: 
    print(b)
    break
((TSTensor(samples:8, vars:2, len:5), tensor([0., 1., 2., 3., 4., 5., 6., 7.])),)
{% endraw %} {% raw %}
# Tests: check that a mixed DataLoaders built from the time series loader
# (dls1) and the tabular loader (dls2) yields batches whose parts line up.
bs = 8
dls = get_mixed_dls(dls1, dls2, bs=bs)
dl = dls.train
xb, yb = dl.one_batch()
# Two inputs: xb[0] comes from dls1, xb[1] from dls2.
test_eq(len(xb), 2)
test_eq(len(xb[0]), bs)
# xb[1] is itself a pair, and each element holds `bs` rows.
test_eq(len(xb[1]), 2)
test_eq(len(xb[1][0]), bs)
test_eq(len(xb[1][1]), bs)
# In the synthetic data the categorical codes are the sample value plus one and
# the continuous column is the sample value times 100, hence the -1 and /100.
test_eq(xb[0].data[:, 0, 0].long(), xb[1][0][:, 0] - 1) # categorical data and ts are in synch
test_eq(xb[0].data[:, 0, 0], (xb[1][1]/100).flatten()) # continuous data and ts are in synch
# Train batch: the targets must equal the loader's input_idxs.
test_eq(tensor(dl.input_idxs), yb.long())
dl = dls.valid
xb, yb = dl.one_batch()
# Valid batch: the targets must equal y looked up at the loader's input_idxs.
test_eq(tensor(y[dl.input_idxs]), yb.long())
{% endraw %}