---
title: data.text2text.language_modeling
keywords: fastai
sidebar: home_sidebar
summary: "This module contains the bits required to use the fastai DataBlock API and/or mid-level data processing pipelines to organize your data for causal and masked language modeling tasks. This includes things like training BERT from scratch or fine-tuning a particular pre-trained LM on your own corpus."
description: "This module contains the bits required to use the fastai DataBlock API and/or mid-level data processing pipelines to organize your data for causal and masked language modeling tasks. This includes things like training BERT from scratch or fine-tuning a particular pre-trained LM on your own corpus."
nb_path: "nbs/01zb_data-text2text-language-modeling.ipynb"
---
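The examples below assume the usual fastai and blurr imports have already been run. A minimal sketch (the exact blurr module path may differ depending on your blurr version):

from fastai.text.all import *   # fastai text stack: torch, pandas (pd), untar_data, DataBlock, ...
from blurr.data.all import *    # blurr data classes and helpers used below (path is version-dependent)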
torch.cuda.set_device(1)
print(f'Using GPU #{torch.cuda.current_device()}: {torch.cuda.get_device_name()}')
wiki_path = untar_data(URLs.WIKITEXT_TINY)
wiki_path.ls()
train_df = pd.read_csv(wiki_path/'train.csv', header=None)
valid_df = pd.read_csv(wiki_path/'test.csv', header=None)
print(len(train_df), len(valid_df))
train_df.head()
train_df['is_valid'] = False
valid_df['is_valid'] = True
df = pd.concat([train_df, valid_df])
df.head()
task = HF_TASKS_AUTO.CausalLM
pretrained_model_name = "gpt2"
hf_arch, hf_config, hf_tokenizer, hf_model = BLURR_MODEL_HELPER.get_hf_objects(pretrained_model_name, task=task)
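`get_hf_objects` returns the architecture name along with the Hugging Face config, tokenizer, and model for the requested task. Conceptually this is similar to loading the pieces yourself with the transformers Auto classes; the sketch below only illustrates that idea and is not what the helper does internally:

from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM

config = AutoConfig.from_pretrained(pretrained_model_name)
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name)
model = AutoModelForCausalLM.from_pretrained(pretrained_model_name)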
# some tokenizers, like GPT and GPT-2, do not come with a pad token, so we add one here, mainly
# so that the "labels" key can be set appropriately (see below)
if (hf_tokenizer.pad_token is None): hf_tokenizer.pad_token = '[PAD]'
hf_tokenizer.pad_token, hf_tokenizer.pad_token_id
Our `HF_CausalLMBeforeBatchTransform` allows us to update the inputs' `labels` and our targets appropriately for a causal LM task. The `labels` argument lets you forgo calculating the loss yourself by letting Hugging Face return it for you, should you choose to do so. Padding tokens in `labels` are set to -100 by default (the same value as `CrossEntropyLossFlat().ignore_index`), which prevents the cross-entropy loss from considering predictions for tokens it should ignore, i.e., the padding tokens. For more information on the meaning of this argument, see the Hugging Face glossary entry for "Labels".
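To make the -100 convention concrete, here is a small, self-contained sketch in plain PyTorch (the tensors are made up for illustration) showing that positions labeled -100 simply drop out of the cross-entropy calculation:

import torch
import torch.nn.functional as F

logits = torch.randn(1, 4, 8)                # (batch, sequence length, vocab size)
labels = torch.tensor([[3, 5, -100, -100]])  # the last two positions are padding

# targets equal to ignore_index (-100) contribute nothing to the loss,
# so only the first two positions are scored here
loss = F.cross_entropy(logits.view(-1, 8), labels.view(-1), ignore_index=-100)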
before_batch_tfm = HF_CausalLMBeforeBatchTransform(hf_arch, hf_tokenizer)
blocks = (HF_Text2TextBlock(before_batch_tfms=before_batch_tfm), noop)
dblock = DataBlock(blocks=blocks, get_x=ColReader(0), splitter=ColSplitter(col='is_valid'))
dls = dblock.dataloaders(df, bs=4)
b = dls.one_batch()
b[0]['input_ids'].shape, b[0]['labels'].shape, b[1].shape
explode_types(b)
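As a quick sanity check on the description above, the batch's `labels` should mirror `input_ids` everywhere except the padding positions, which are replaced with -100 (a hypothetical check, assuming the padding in this batch uses `hf_tokenizer.pad_token_id`):

inps = b[0]
is_pad = inps['input_ids'] == hf_tokenizer.pad_token_id

# non-padding positions keep their token ids; padding positions become -100
assert (inps['labels'][~is_pad] == inps['input_ids'][~is_pad]).all()
assert (inps['labels'][is_pad] == -100).all()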
dls.show_batch(dataloaders=dls, max_n=2, trunc_at=500)