---
title: modeling
keywords: fastai
sidebar: home_sidebar
summary: "This module contains custom models, loss functions, and a default layer group splitter for use in applying discriminative learning rates to your huggingface models trained via fastai"
description: "This module contains custom models, loss functions, and a default layer group splitter for use in applying discriminative learning rates to your huggingface models trained via fastai"
---
#cuda
torch.cuda.set_device(1)
print(f'Using GPU #{torch.cuda.current_device()}: {torch.cuda.get_device_name()}')
Note that HF_BaseModelWrapper includes some nifty code for just passing in the things your model needs, as not all transformer architectures require/use the same information.
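As a rough sketch of the idea (an assumption about the approach, not blurr's actual implementation), such a wrapper can inspect the wrapped model's forward signature and only pass along the arguments that architecture actually accepts:
import inspect
import torch.nn as nn

class NaiveModelWrapper(nn.Module):
    # hypothetical sketch: only forward the kwargs the wrapped model's forward() accepts
    def __init__(self, hf_model):
        super().__init__()
        self.hf_model = hf_model
        self.valid_args = set(inspect.signature(hf_model.forward).parameters.keys())

    def forward(self, **kwargs):
        # e.g. drops token_type_ids for architectures like distilbert that don't use them
        model_kwargs = {k: v for k, v in kwargs.items() if k in self.valid_args}
        return self.hf_model(**model_kwargs)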
path = untar_data(URLs.IMDB_SAMPLE)
model_path = Path('models')
imdb_df = pd.read_csv(path/'texts.csv')
imdb_df.head()
task = HF_TASKS_AUTO.ForSequenceClassification
pretrained_model_name = "roberta-base" # "distilbert-base-uncased" "bert-base-uncased"
config = AutoConfig.from_pretrained(pretrained_model_name)
hf_arch, hf_tokenizer, hf_config, hf_model = BLURR_MODEL_HELPER.get_auto_hf_objects(pretrained_model_name,
task=task,
config=config)
# single input
blocks = (
HF_TextBlock.from_df(text_cols_lists=[['text']], hf_arch=hf_arch, hf_tokenizer=hf_tokenizer),
CategoryBlock
)
# can't export lambda functions, so made the getter a standard method
def get_x(x):
return x.text0
dblock = DataBlock(blocks=blocks,
get_x=get_x,
get_y=ColReader('label'),
splitter=ColSplitter(col='is_valid'))
dls = dblock.dataloaders(imdb_df, bs=4)
dls.show_batch(hf_tokenizer=hf_tokenizer, max_n=2)
Using the HF_BaseModelWrapper approach ...
model = HF_BaseModelWrapper(hf_model)
learn = Learner(dls,
model,
opt_func=partial(Adam, decouple_wd=True),
loss_func=CrossEntropyLossFlat(),
metrics=[accuracy],
splitter=hf_splitter)
learn.create_opt() # -> will create your layer groups based on your "splitter" function
learn.freeze()
.to_fp16() requires a GPU, so I had to remove it for the tests to run on GitHub.
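If you are training on a GPU, you can still enable mixed precision the usual fastai way:
# learn = learn.to_fp16()  # mixed precision training; only use this when a GPU is available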
Using the HF_BaseModelCallback approach for improved integration with fastai ...
learn = Learner(dls,
hf_model,
opt_func=partial(Adam, decouple_wd=True),
loss_func=CrossEntropyLossFlat(),
metrics=[accuracy],
cbs=[HF_BaseModelCallback],
splitter=hf_splitter)
learn.create_opt() # -> will create your layer groups based on your "splitter" function
learn.freeze()
# learn.summary()
print(len(learn.opt.param_groups))
#slow
learn.lr_find(suggestions=True)
#slow
learn.fit_one_cycle(3, lr_max=1e-3)
And here we create a @typedispatch implementation of Learner.show_results.
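If you haven't seen fastai's typedispatch before, it picks an implementation based on the annotated argument types. A toy example of the mechanism (not the actual show_results code) looks like this:
from fastcore.dispatch import typedispatch

@typedispatch
def describe(x:int): return f'got an int: {x}'

@typedispatch
def describe(x:str): return f'got a string: {x}'

describe(3), describe('hi')  # which implementation runs depends on the runtime type of x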
learn.show_results(hf_tokenizer=hf_tokenizer, max_n=2)
learn.predict('I really liked the movie')
learn.unfreeze()
#slow
learn.fit_one_cycle(3, lr_max=slice(1e-6, 1e-3))
learn.recorder.plot_loss()
learn.show_results(hf_tokenizer=hf_tokenizer, max_n=2)
learn.predict("This was a really good movie, ")
learn.predict("Acting was so bad it was almost funny.")
And what about inference?
learn.export(fname='seq_class_learn_export.pkl')
inf_learn = load_learner(fname='seq_class_learn_export.pkl')
inf_learn.predict("This movie should not be seen by anyone!!!!")
And here we provide a custom loss function for our question answering task, expanding on some techniques learned from here and here.
In fact, this new loss function can be used in many other multi-modal architectures, with any mix of loss functions. For example, it can be amended to include the is_impossible
task, as well as the start/end token tasks in the SQuAD v2 dataset.
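The gist of such a loss function is simple: apply one loss function per model output/target pair and sum (optionally weight) the results. A minimal sketch of that pattern, assuming the model returns a tuple of predictions and fastai passes the targets in as *yb (not necessarily the exact MultiTargetLoss implementation), might look like:
import torch.nn as nn

class NaiveMultiTargetLoss(nn.Module):
    # hypothetical sketch: one loss function per output, optionally weighted, summed together
    def __init__(self, loss_funcs=None, weights=None):
        super().__init__()
        self.loss_funcs = loss_funcs if loss_funcs else [nn.CrossEntropyLoss(), nn.CrossEntropyLoss()]
        self.weights = weights if weights else [1.] * len(self.loss_funcs)

    def forward(self, outputs, *targets):
        # outputs: tuple of predictions (e.g. start and end logits); targets: one tensor per output
        return sum(w * lf(o, t) for lf, w, o, t in zip(self.loss_funcs, self.weights, outputs, targets))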
Again, we'll use a subset of the pre-processed SQuAD v2 dataset for our purposes below.
path = Path('./')
squad_df = pd.read_csv(path/'squad_sample.csv'); len(squad_df)
squad_df.head(2)
pretrained_model_name = 'bert-large-uncased-whole-word-masking-finetuned-squad'
hf_tokenizer_cls = BertTokenizer
hf_model_cls = HF_MODELS.BertForQuestionAnswering
hf_arch, hf_tokenizer, hf_config, hf_model = BLURR_MODEL_HELPER.get_hf_objects(pretrained_model_name,
hf_tokenizer_cls,
hf_model_cls)
# # here's a pre-trained roberta model for squad you can try too
# pretrained_model_name = "ahotrod/roberta_large_squad2"
# hf_arch, hf_tokenizer, hf_config, hf_model = \
# BLURR_MODEL_HELPER.get_auto_hf_objects(pretrained_model_name, task=HF_TASKS_AUTO.ForQuestionAnswering)
def pre_process_squad(row):
context, qst, ans = row['context'], row['question_text'], row['answer_text']
add_prefix_space = hf_arch in ['gpt2', 'roberta']
if(hf_tokenizer.padding_side == 'right'):
tok_input = hf_tokenizer.convert_ids_to_tokens(hf_tokenizer.encode(qst, context,
add_prefix_space=add_prefix_space))
else:
tok_input = hf_tokenizer.convert_ids_to_tokens(hf_tokenizer.encode(context, qst,
add_prefix_space=add_prefix_space))
tok_ans = hf_tokenizer.tokenize(str(row['answer_text']),
add_special_tokens=False,
add_prefix_space=add_prefix_space)
start_idx, end_idx = 0,0
for idx, tok in enumerate(tok_input):
try:
if (tok == tok_ans[0] and tok_input[idx:idx + len(tok_ans)] == tok_ans):
start_idx, end_idx = idx, idx + len(tok_ans)
break
except: pass
row['tokenized_input'] = tok_input
row['tokenized_input_len'] = len(tok_input)
row['tok_answer_start'] = start_idx
row['tok_answer_end'] = end_idx
return row
squad_df = squad_df.apply(pre_process_squad, axis=1)
max_seq_len= 128
squad_df = squad_df[(squad_df.tokenized_input_len < max_seq_len) & (squad_df.is_impossible == False)]
vocab = list(range(max_seq_len))
# vocab = dict(enumerate(range(max_seq_len)));
# account for tokenizers that pad on right or left side
trunc_strat = 'only_second' if (hf_tokenizer.padding_side == 'right') else 'only_first'
txt_cols = [['question_text'],['context']] if (hf_tokenizer.padding_side == 'right') else [['context'],['question_text']]
# override HF_BatchTransform defaults (optional)
hf_batch_tfm = HF_BatchTransform(hf_arch, hf_tokenizer, task=ForQuestionAnsweringTask(),
max_seq_len=128, truncation_strategy=trunc_strat)
blocks = (
HF_TextBlock.from_df(text_cols_lists=txt_cols,
hf_arch=hf_arch, hf_tokenizer=hf_tokenizer, hf_batch_tfm=hf_batch_tfm),
CategoryBlock(vocab=vocab),
CategoryBlock(vocab=vocab)
)
# can't export lambda functions, so made the getter a standard method
def get_x(x): return (x.text0, x.text1)
dblock = DataBlock(blocks=blocks,
get_x=get_x,
get_y=[ColReader('tok_answer_start'), ColReader('tok_answer_end')],
splitter=RandomSplitter(),
n_inp=1)
dls = dblock.dataloaders(squad_df, bs=4)
len(dls.vocab), dls.vocab[1], dls.vocab[2]
dls.show_batch(hf_tokenizer=hf_tokenizer, max_n=2)
model = HF_QstAndAnsModelWrapper(hf_model)
learn = Learner(dls,
hf_model,
opt_func=partial(Adam, decouple_wd=True),
cbs=[HF_QstAndAnsModelCallback],
splitter=hf_splitter)
learn.loss_func=MultiTargetLoss()
learn.create_opt() # -> will create your layer groups based on your "splitter" function
learn.freeze()
Notice above how I had to define the loss function after creating the Learner object. I'm not sure why, but setting MultiTargetLoss as the loss function when the Learner is created prevents the learner from being exported.
# learn.summary()
print(len(learn.opt.param_groups))
If we are using the callback, as we are here, we have to unpack the HF_BaseInput (the x returned from dls.one_batch()) into a tuple (which is what *x does), since learn.model below is in this case a huggingface transformer model (not a wrapper).
x, y_start, y_end = dls.one_batch()
preds = learn.model(*x)
len(preds),preds[0].shape
#slow
learn.lr_find(suggestions=True)
#slow
learn.fit_one_cycle(3, lr_max=1e-3)
learn.show_results(hf_tokenizer=hf_tokenizer, skip_special_tokens=True, max_n=2)
inf_df = pd.DataFrame.from_dict([{
'text0': 'What did George Lucas make?',
'text1': 'George Lucas created Star Wars in 1977. He directed and produced it.'
}],
orient='columns')
learn.predict(inf_df.iloc[0])
inp_ids = hf_tokenizer.encode('What did George Lucas make?',
'George Lucas created Star Wars in 1977. He directed and produced it.')
hf_tokenizer.convert_ids_to_tokens(inp_ids, skip_special_tokens=False)[11:13]
Note that there is currently a bug in fastai v2 (or with how I'm assembling everything) that prevents us from seeing the decoded predictions and probabilities for the "end" token.
inf_df = pd.DataFrame.from_dict([{
'text0': 'When was Star Wars made?',
'text1': 'George Lucas created Star Wars in 1977. He directed and produced it.'
}],
orient='columns')
test_dl = dls.test_dl(inf_df)
inp, probs, _, preds = learn.get_preds(dl=test_dl, with_input=True, with_decoded=True)
hf_tokenizer.convert_ids_to_tokens(inp[0].tolist()[0],
skip_special_tokens=False)[torch.argmax(probs[0]):torch.argmax(probs[1])]
learn.unfreeze()
#slow
learn.fit_one_cycle(3, lr_max=slice(1e-7, 1e-4))
learn.recorder.plot_loss()
learn.show_results(hf_tokenizer=hf_tokenizer, max_n=2)
learn.predict(inf_df.iloc[0])
preds, pred_classes, probs = learn.predict(inf_df.iloc[0])
preds
inp_ids = hf_tokenizer.encode('When was Star Wars made?',
'George Lucas created Star Wars in 1977. He directed and produced it.')
hf_tokenizer.convert_ids_to_tokens(inp_ids, skip_special_tokens=False)[int(preds[0]):int(preds[1])]
And what about inference?
Note that I had to replace the loss function because of the above-mentioned issue with exporting the model when MultiTargetLoss is the loss function. After getting our inference learner, we put it back and we're good to go!
learn.loss_func = nn.CrossEntropyLoss()
learn.export(fname='q_and_a_learn_export.pkl')
inf_learn = load_learner(fname='q_and_a_learn_export.pkl')
inf_learn.loss_func = MultiTargetLoss()
inf_df = pd.DataFrame.from_dict([
{'text0': 'Who created Star Wars?',
'text1': 'George Lucas created Star Wars in 1977. He directed and produced it.'}],
orient='columns')
inf_learn.predict(inf_df.iloc[0])
inp_ids = hf_tokenizer.encode('Who created Star Wars?',
'George Lucas created Star Wars in 1977. He directed and produced it.')
hf_tokenizer.convert_ids_to_tokens(inp_ids, skip_special_tokens=False)[7:9]
The objective of token classification is to predict the correct label for each token provided in the input. In the computer vision world, this is akin to what we do in segmentation tasks, whereby we attempt to predict the class/label for each pixel in an image. Named entity recognition (NER) is an example of token classification in the NLP space.
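For example, in NER every token in a sequence gets its own tag (purely illustrative tokens and tags below, not taken from the GermEval data):
tokens = ['George', 'Lucas', 'created', 'Star', 'Wars', 'in', '1977', '.']
tags   = ['B-PER',  'I-PER', 'O',       'B-OTH', 'I-OTH', 'O', 'O',    'O']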
# germ_eval_df = pd.read_csv('./data/task-token-classification/germeval2014ner/germeval2014ner_cleaned.csv')
germ_eval_df = pd.read_csv('./germeval2014_sample.csv')
print(len(germ_eval_df))
germ_eval_df.head()
We are only going to be working with a small sample from the GermEval 2014 dataset ... so the results might not be all that great :)
germ_eval_df.dropna(inplace=True)
germ_eval_df[germ_eval_df.token.isna()]
labels = sorted(germ_eval_df.tag1.unique())
print(labels)
germ_eval_df = germ_eval_df.groupby(by='seq_id').agg(list).reset_index()
germ_eval_df.head(2)
After getting rid of troublesome NaNs and getting my labels, the code above converts my input dataframe into something I can model by making the token and tag1 columns represent a list of tokens (or labels) for a given sequence.
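In other words, the groupby turns one-row-per-token data into one-row-per-sequence data. A tiny toy example (illustrative values only) of the same reshaping:
toy_df = pd.DataFrame({'seq_id': [1, 1, 2],
                       'token': ['Angela', 'Merkel', 'Berlin'],
                       'tag1': ['B-PER', 'I-PER', 'B-LOC']})
toy_df.groupby(by='seq_id').agg(list).reset_index()
# seq_id 1 -> token=['Angela', 'Merkel'], tag1=['B-PER', 'I-PER']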
task = HF_TASKS_AUTO.ForTokenClassification
pretrained_model_name = "bert-base-multilingual-cased"
config = AutoConfig.from_pretrained(pretrained_model_name)
config.num_labels = len(labels)
Notice above how I set the config.num_labels attribute to the number of labels we want our model to be able to predict. The model will update its last layer accordingly (this concept is essentially transfer learning).
hf_arch, hf_tokenizer, hf_config, hf_model = BLURR_MODEL_HELPER.get_auto_hf_objects(pretrained_model_name,
task=task,
config=config)
hf_arch, type(hf_tokenizer), type(hf_config), type(hf_model)
test_eq(hf_config.num_labels, len(labels))
# single input
blocks = (
HF_TextBlock.from_df(text_cols_lists=[['token']],
hf_arch=hf_arch,
hf_tokenizer=hf_tokenizer,
tok_func_mode='list',
task=ForTokenClassificationTask(), max_seq_len=128),
HF_TokenCategoryBlock(vocab=labels)
)
def get_y(inp):
return [ (label, len(hf_tokenizer.tokenize(str(entity)))) for entity, label in zip(inp.token, inp.tag1) ]
dblock = DataBlock(blocks=blocks,
get_x=lambda x: x.text0,
get_y=get_y,
splitter=RandomSplitter())
We have to define a get_y that creates the same number of labels as there are subtokens for a particular token. For example, my name "Wayde" gets split up into two subtokens, "Way" and "##de". The label for "Wayde" is "B-PER" and we just repeat it for the subtokens. This all gets cleaned up when we show results and get predictions.
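As a quick illustration (the exact subtokens depend on the tokenizer in use):
# illustrative only: a token that splits into multiple subtokens gets a single (label, n_subtokens) entry
subtoks = hf_tokenizer.tokenize('Wayde')   # e.g. ['Way', '##de'] for some tokenizers
y_item = [('B-PER', len(subtoks))]         # what get_y above produces for that one token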
dls = dblock.dataloaders(germ_eval_df, bs=4)
dls.show_batch(hf_tokenizer=hf_tokenizer, max_n=2)
learn = Learner(dls,
hf_model,
opt_func=partial(Adam, decouple_wd=True),
metrics=[accuracy],
cbs=[HF_BaseModelCallback],
splitter=hf_splitter)
learn.create_opt() # -> will create your layer groups based on your "splitter" function
learn.freeze()
# learn.summary()
b = dls.one_batch()
preds = learn.model(*b[0])
len(preds),preds[0].shape
len(b), len(b[0]), b[0][0].shape, len(b[1]), b[1].shape
print(preds[0].view(-1, preds[0].shape[-1]).shape, b[1].view(-1).shape)
test_eq(preds[0].view(-1, preds[0].shape[-1]).shape[0], b[1].view(-1).shape[0])
print(len(learn.opt.param_groups))
#slow
learn.unfreeze()
learn.lr_find(suggestions=True)
#slow
learn.fit_one_cycle(3, lr_max= 3e-5, moms=(0.8,0.7,0.8))
learn.show_results(hf_tokenizer=hf_tokenizer, max_n=20)
res = learn.predict('My name is Wayde and I live in San Diego')
print(res[0])
The default Learner.predict method returns a prediction per subtoken, including the special tokens for each architecture's tokenizer.
txt ="Hi! My name is Wayde Gilliam from ohmeow.com."
res = learn.predict_tokens(txt)
print([(tok, lbl) for tok,lbl in zip(res[0],res[1])])
It's interesting (and very cool) how well this model performs on English even though it was trained against a German corpus.