---
title: text.modeling.question_answering
keywords: fastai
sidebar: home_sidebar
summary: "This module contains custom models, loss functions, custom splitters, etc... for question answering tasks"
description: "This module contains custom models, loss functions, custom splitters, etc... for question answering tasks"
nb_path: "nbs/14_text-modeling-question-answering.ipynb"
---
import pandas as pd
from datasets import load_dataset

raw_datasets = load_dataset("squad", split=["train[:1000]", "validation[:200]"])
raw_train_df = pd.DataFrame(raw_datasets[0])
raw_valid_df = pd.DataFrame(raw_datasets[1])
raw_train_df["is_valid"] = False
raw_valid_df["is_valid"] = True

# concatenate into a single DataFrame
squad_df = pd.concat([raw_train_df, raw_valid_df])

# include the required start/end character indices and full text of the answer
squad_df["ans_start_char_idx"] = squad_df.answers.apply(lambda v: v["answer_start"][0] if len(v["answer_start"]) > 0 else 0)
squad_df["answer_text"] = squad_df.answers.apply(lambda v: v["text"][0] if len(v["text"]) > 0 else "")
squad_df["ans_end_char_idx"] = squad_df["ans_start_char_idx"].astype(int) + squad_df["answer_text"].str.len()

print(len(squad_df))
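As a quick sanity check on the index arithmetic above (the start character index plus the answer length gives the exclusive end index), here is a small illustrative snippet using a made-up context:

```python
# illustrative check of the start/end character index arithmetic
context = "George Lucas created Star Wars in 1977."
answer = "Star Wars"
start_idx = context.index(answer)   # 21
end_idx = start_idx + len(answer)   # 30 (exclusive)
assert context[start_idx:end_idx] == answer
```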
model_cls = AutoModelForQuestionAnswering
pretrained_model_name = "bert-large-uncased-whole-word-masking-finetuned-squad"
hf_arch, hf_config, hf_tokenizer, hf_model = NLP.get_hf_objects(pretrained_model_name, model_cls=model_cls)
max_seq_len = 128

# our targets are start/end *token* positions, so the "vocab" is simply the positions 0 ... max_seq_len-1
vocab = list(range(max_seq_len))
preprocessor = QAPreprocessor(
    hf_tokenizer, id_attr="id", tok_kwargs={"return_overflowing_tokens": True, "max_length": max_seq_len, "stride": 64}
)
proc_df = preprocessor.process_df(squad_df)
proc_df.head(1)
before_batch_tfm = QABatchTokenizeTransform(hf_arch, hf_config, hf_tokenizer, hf_model, max_length=max_seq_len)
blocks = (
    TextBlock(batch_tokenize_tfm=before_batch_tfm, input_return_type=QATextInput),
    CategoryBlock(vocab=vocab),
    CategoryBlock(vocab=vocab),
)
# since the data is preprocessed, we include a "text" key with the values of our question and context
def get_x(item):
    return {"text": (item.proc_question, item.proc_context), "id": item.id}
dblock = DataBlock(
    blocks=blocks,
    get_x=get_x,
    get_y=[ItemGetter("ans_start_token_idx"), ItemGetter("ans_end_token_idx")],
    splitter=ColSplitter(),
    n_inp=1,
)
dls = dblock.dataloaders(proc_df, bs=4)
len(dls.vocab), dls.vocab[0], dls.vocab[1]
dls.valid.show_batch(dataloaders=dls, max_n=2)
Here we create a question answering specific subclass of `BaseModelCallback` in order to grab all of the start and end predictions. Hugging Face question answering models will calculate the loss for you when you include both the `start_positions` and `end_positions` in the inputs dictionary. This is done by the `QABatchTokenizeTransform` when `include_labels=True` (which is the default). It also requires that you set your `Learner`'s loss function to `PreCalculatedQALoss` for training to work properly.
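To see the mechanics outside of blurr, here's a minimal sketch using only the `transformers` library; the checkpoint and target token indices below are illustrative, not taken from the example above:

```python
import torch
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

tok = AutoTokenizer.from_pretrained("distilbert-base-cased-distilled-squad")
model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-cased-distilled-squad")

inputs = tok("Who created Star Wars?", "George Lucas created Star Wars in 1977.", return_tensors="pt")

# with start_positions/end_positions included, the model computes the loss itself ...
outputs = model(**inputs, start_positions=torch.tensor([8]), end_positions=torch.tensor([9]))
print(outputs.loss)  # the pre-calculated loss that PreCalculatedQALoss simply passes through
# ... alongside the raw logits from which the start/end predictions are derived
print(outputs.start_logits.shape, outputs.end_logits.shape)
```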
Notice below how I had to define the loss function *after* creating the `Learner` object. I'm not sure why, but assigning `MultiTargetLoss` at creation time prevents the learner from being exported.
model = BaseModelWrapper(hf_model)
learn_cbs = [QAModelCallback]
validation_ds = proc_df[proc_df.is_valid == True].to_dict(orient="records")
fit_cbs = [QAMetricsCallback(compute_metrics_func=compute_qa_metrics, validation_ds=validation_ds)]
learn = Learner(dls, model, opt_func=Adam, cbs=learn_cbs, splitter=blurr_splitter)
learn.loss_func = PreCalculatedQALoss() # MultiTargetLoss()
learn.create_opt() # -> will create your layer groups based on your "splitter" function
learn.freeze()
# learn.summary() # no glory here :(
print(len(learn.opt.param_groups))
learn.lr_find(suggest_funcs=[minimum, steep, valley, slide])
learn.fit_one_cycle(1, lr_max=1e-3, cbs=fit_cbs)
learn.show_results(learner=learn, skip_special_tokens=True, max_n=4, trunc_at=500)
learn.unfreeze()
learn.fit_one_cycle(1, lr_max=slice(1e-9, 1e-7), cbs=fit_cbs)
learn.recorder.plot_loss()
learn.show_results(learner=learn, max_n=2, trunc_at=100)
context = "George Lucas created Star Wars in 1977. He directed and produced it."
learn.blurr_predict_answers({"question": "What did George Lucas make?", "context": context})
context = "George Lucas created Star Wars in 1977. He directed and produced it."
learn.blurr_predict_answers(
    [
        {"question": "What did George Lucas make?", "context": context},
        {"question": "What year did Star Wars come out?", "context": context},
        {"question": "What did George Lucas do?", "context": context},
        {"question": "Who plays Spock in the movie?", "context": context},
    ]
)
Note that I had to replace the loss function because of the above-mentioned issue with exporting the model while `MultiTargetLoss` is attached. After getting our inference learner, we put it back and we're good to go!
export_name = "q_and_a_learn_export"

# temporarily swap in an exportable loss function (see the note above)
learn.loss_func = CrossEntropyLossFlat()
learn.export(fname=f"{export_name}.pkl")

# ... and restore it on the inference learner
inf_learn = load_learner(fname=f"{export_name}.pkl")
inf_learn.loss_func = MultiTargetLoss()
context = "George Lucas created Star Wars in 1977. He directed and produced it."
inf_learn.blurr_predict_answers(
    [
        {"question": "What did George Lucas make?", "context": context},
        {"question": "What year did Star Wars come out?", "context": context},
        {"question": "What did George Lucas do?", "context": context},
        {"question": "Who plays Spock in the movie?", "context": context},
    ]
)
`BlearnerForQuestionAnswering` requires a question, a context (within which to find the answer to the question), and the start/end indices of where the answer lies in the tokenized context. Because those indices vary by tokenizer, we can pass a `preprocess_func` that will take our raw data, perform any preprocessing we want, and return it in a way that will work for extractive QA.
raw_datasets = load_dataset("squad", split=["train[:1000]", "validation[:200]"])
raw_train_df = pd.DataFrame(raw_datasets[0])
raw_valid_df = pd.DataFrame(raw_datasets[1])
raw_train_df["is_valid"] = False
raw_valid_df["is_valid"] = True
# concatenate into a single DataFrame
squad_df = pd.concat([raw_train_df, raw_valid_df])
# include the required start/end character indices and full text of the answer
squad_df["ans_start_char_idx"] = squad_df.answers.apply(lambda v: v["answer_start"][0] if len(v["answer_start"]) > 0 else 0)
squad_df["answer_text"] = squad_df.answers.apply(lambda v: v["text"][0] if len(v["text"]) > 0 else "")
squad_df["ans_end_char_idx"] = squad_df["ans_start_char_idx"].astype(int) + squad_df["answer_text"].str.len()
# run our modified DataFrame thru the QAPreprocessor to get the start/end "token" indices we want to predict
preprocessor = QAPreprocessor(
    hf_tokenizer, id_attr="id", tok_kwargs={"return_overflowing_tokens": True, "max_length": max_seq_len, "stride": 64}
)
proc_df = preprocessor.process_df(squad_df)
proc_df.head(1)
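A `preprocess_func` passed to the `Blearner` could bundle the steps above into a single function. Here's a rough sketch; the function name is made up for illustration, and only `QAPreprocessor` comes from blurr:

```python
# hypothetical preprocess_func that mirrors the manual steps above
def preprocess_squad(df, hf_tokenizer, max_seq_len=128):
    df["ans_start_char_idx"] = df.answers.apply(lambda v: v["answer_start"][0] if len(v["answer_start"]) > 0 else 0)
    df["answer_text"] = df.answers.apply(lambda v: v["text"][0] if len(v["text"]) > 0 else "")
    df["ans_end_char_idx"] = df["ans_start_char_idx"].astype(int) + df["answer_text"].str.len()

    preprocessor = QAPreprocessor(
        hf_tokenizer,
        id_attr="id",
        tok_kwargs={"return_overflowing_tokens": True, "max_length": max_seq_len, "stride": 64},
    )
    return preprocessor.process_df(df)
```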
Now we can create our `Blearner`:
pretrained_model_name = "bert-large-uncased-whole-word-masking-finetuned-squad"
learn = BlearnerForQuestionAnswering.from_data(
    proc_df,
    pretrained_model_name,
    id_attr="id",
    question_attr="proc_question",
    context_attr="proc_context",
    max_seq_len=128,
    dl_kwargs={"bs": 4},
).to_fp16()
validation_ds = proc_df[proc_df.is_valid == True].to_dict(orient="records")
fit_cbs = [QAMetricsCallback(compute_metrics_func=compute_qa_metrics, validation_ds=validation_ds)]
learn.dls.show_batch(dataloaders=learn.dls, max_n=4, trunc_at=500)
learn.fit_one_cycle(3, lr_max=1e-3, cbs=fit_cbs)
learn.show_results(learner=learn, skip_special_tokens=True, max_n=2, trunc_at=500)
# switch back to fp32 and swap in an exportable loss function before exporting (see the note above)
learn = learn.to_fp32()
learn.loss_func = CrossEntropyLossFlat()
learn.export(fname=f"{export_name}.pkl")

# ... and restore it on the inference learner
inf_learn = load_learner(fname=f"{export_name}.pkl")
inf_learn.loss_func = MultiTargetLoss()
context = "George Lucas created Star Wars in 1977. He directed and produced it."
inf_learn.blurr_predict_answers(
    [
        {"question": "What did George Lucas make?", "context": context},
        {"question": "What year did Star Wars come out?", "context": context},
        {"question": "What did George Lucas do?", "context": context},
        {"question": "Who plays Spock in the movie?", "context": context},
    ]
)