---
title: text.modeling.seq2seq.translation
keywords: fastai
sidebar: home_sidebar
summary: "This module contains custom models, custom splitters, etc... for translation tasks."
description: "This module contains custom models, custom splitters, etc... for translation tasks."
nb_path: "nbs/22_text-modeling-seq2seq-translation.ipynb"
---
# --- Data: load the WMT16 German-English training split and keep a small sample ---
dataset = load_dataset("wmt16", "de-en", split="train")
# Shuffle with a fixed seed so the 1200-row sample is the same on every run.
dataset = dataset.shuffle(seed=32).select(range(1200))
# Build a DataFrame with one "de" column and one "en" column from the "translation" field.
wmt_df = pd.DataFrame(dataset["translation"], columns=["de", "en"])
len(wmt_df)
wmt_df.head(2)
# --- Hugging Face objects for the pretrained German->English checkpoint ---
pretrained_model_name = "Helsinki-NLP/opus-mt-de-en"
model_cls = AutoModelForSeq2SeqLM
hf_arch, hf_config, hf_tokenizer, hf_model = NLP.get_hf_objects(pretrained_model_name, model_cls=model_cls)
hf_arch, type(hf_tokenizer), type(hf_config), type(hf_model)
# --- DataBlock / DataLoaders: German text is the input ("de"), English text the target ("en") ---
blocks = (Seq2SeqTextBlock(hf_arch, hf_config, hf_tokenizer, hf_model), noop)
dblock = DataBlock(blocks=blocks, get_x=ColReader("de"), get_y=ColReader("en"), splitter=RandomSplitter())
dls = dblock.dataloaders(wmt_df, bs=2)
# Inspect one batch: b[0] is indexed by "input_ids" below, b[1] exposes `.shape`.
b = dls.one_batch()
len(b), b[0]["input_ids"].shape, b[1].shape
dls.show_batch(dataloaders=dls, max_n=2, input_trunc_at=250, target_trunc_at=250)
# --- Training setup ---
# Maps each metric name to the key read from that metric's result.
# NOTE(review): presumably these are Hugging Face metric names/result keys - confirm against Seq2SeqMetricsCallback.
seq2seq_metrics = {"bleu": {"returns": "bleu"}, "meteor": {"returns": "meteor"}, "sacrebleu": {"returns": "score"}}
model = BaseModelWrapper(hf_model)
# `learn_cbs` are attached to the Learner itself; `fit_cbs` are only passed to the fit call further down.
learn_cbs = [BaseModelCallback]
fit_cbs = [Seq2SeqMetricsCallback(custom_metrics=seq2seq_metrics)]
learn = Learner(
dls,
model,
opt_func=partial(Adam),
loss_func=PreCalculatedCrossEntropyLoss(), # CrossEntropyLossFlat()
cbs=learn_cbs,
splitter=partial(blurr_seq2seq_splitter, arch=hf_arch),
)
# learn = learn.to_native_fp16() #.to_fp16()
learn.freeze()
learn.summary()
# --- Sanity check: push one batch through the wrapped model and compare shapes ---
b = dls.one_batch()
preds = learn.model(b[0])
# The model output is indexed by "loss" and "logits".
len(preds), preds["loss"].shape, preds["logits"].shape
len(b), len(b[0]), b[0]["input_ids"].shape, len(b[1]), b[1].shape
# Number of optimizer parameter groups produced by the splitter passed to the Learner.
print(len(learn.opt.param_groups))
# --- Train: find a learning rate, fit one cycle with the metrics callback, show predictions ---
learn.lr_find(suggest_funcs=[minimum, steep, valley, slide])
learn.fit_one_cycle(1, lr_max=4e-5, cbs=fit_cbs)
learn.show_results(learner=learn, input_trunc_at=500, target_trunc_at=500)
Here we add a `Learner.blurr_translate` method to bring the results in line with the format returned by Hugging Face's pipeline method.
# --- Inference on a single German sentence ---
test_de = "Ich trinke gerne Bier"
# Generic generation call: asks for 3 returned sequences under the "translation_texts" key.
outputs = learn.blurr_generate(test_de, key="translation_texts", num_return_sequences=3)
outputs
# Translation-specific call on the same input, again asking for 3 sequences.
learn.blurr_translate(test_de, num_return_sequences=3)
# --- Export the Learner and reload it for inference ---
export_fname = "translation_export"
# NOTE(review): metrics are cleared before exporting, presumably so they are not serialized with the Learner - confirm.
learn.metrics = None
learn.export(fname=f"{export_fname}.pkl")
inf_learn = load_learner(fname=f"{export_fname}.pkl")
# The reloaded Learner is used exactly like the trained one.
inf_learn.blurr_translate(test_de)
# --- High-level API: build a translation Learner directly from the DataFrame ---
# The "de" column is the source text (German) and the "en" column the target text (English);
# `dl_kwargs` carries the batch size used for the DataLoaders.
learn = BlearnerForTranslation.from_data(
wmt_df,
"Helsinki-NLP/opus-mt-de-en",
src_lang_name="German",
src_lang_attr="de",
trg_lang_name="English",
trg_lang_attr="en",
dl_kwargs={"bs": 2},
)
# Metrics callback supplied by the Blearner class; it is passed only to the fit call.
metrics_cb = BlearnerForTranslation.get_metrics_cb()
learn.fit_one_cycle(1, lr_max=4e-5, cbs=[metrics_cb])
learn.show_results(learner=learn, input_trunc_at=500, target_trunc_at=250)
# --- Inference ---
test_de = "Ich trinke gerne Bier"
learn.blurr_translate(test_de)
# --- Export and reload ---
export_fname = "translation_export"
# NOTE(review): metrics are cleared before exporting, presumably so they are not serialized with the Learner - confirm.
learn.metrics = None
# Convert the Learner to full precision before exporting it.
learn = learn.to_fp32()
learn.export(fname=f"{export_fname}.pkl")
inf_learn = load_learner(fname=f"{export_fname}.pkl")
inf_learn.blurr_generate(test_de)
The purpose of the following tests is to ensure, as much as possible, that the core training code works for the pretrained translation models below. These tests are excluded from the CI workflow because of how long they would take to run and the amount of data that would need to be downloaded.
Note: Feel free to modify the code below to test whatever pretrained translation models you are working with ... and if any of your pretrained translation models fail, please submit a GitHub issue (or a PR if you'd like to fix it yourself).
# Release the Learner left over from the examples above (if there is one)
# before the per-model tests start allocating again.
try:
    del learn
except NameError:
    # No Learner exists in this session, so there is nothing to free.
    # Only this expected case is ignored; any other error still surfaces.
    pass
else:
    # Emptying the CUDA cache is only useful once the Learner has been dropped.
    torch.cuda.empty_cache()
# List the model types reported for the "ConditionalGeneration" task, skipping names that start with "TF".
[model_type for model_type in NLP.get_models(task="ConditionalGeneration") if (not model_type.startswith("TF"))]
# Checkpoints exercised by the test loop below; the commented-out entries are skipped.
pretrained_model_names = [
"facebook/bart-base",
"facebook/wmt19-de-en", # FSMT
"Helsinki-NLP/opus-mt-de-en", # MarianMT
#'sshleifer/tiny-mbart',
#'google/mt5-small',
"t5-small",
]
# Same 1200-row, fixed-seed WMT16 German-English sample as used in the examples above.
dataset = load_dataset("wmt16", "de-en", split="train")
dataset = dataset.shuffle(seed=32).select(range(1200))
wmt_df = pd.DataFrame(dataset["translation"], columns=["de", "en"])
len(wmt_df)
# hide_output
# Settings shared by every model in the test loop:
# batch size, maximum input length and maximum target length passed to the tokenize transform.
model_cls = AutoModelForSeq2SeqLM
bsz = 2
inp_seq_sz = 128
trg_seq_sz = 128
# One (arch, tokenizer class name, model class name, status, error) tuple is appended per model.
test_results = []
# Smoke-test every checkpoint in `pretrained_model_names`: build its DataLoaders and Learner,
# check one batch, run a shortened training pass and record PASSED/FAILED in `test_results`.
for model_name in pretrained_model_names:
    print(f"=== {model_name} ===\n")

    # Only the tiny mBART checkpoint is given explicit source/target language codes.
    hf_tok_kwargs = {}
    if model_name == "sshleifer/tiny-mbart":
        hf_tok_kwargs["src_lang"], hf_tok_kwargs["tgt_lang"] = "de_DE", "en_XX"

    hf_arch, hf_config, hf_tokenizer, hf_model = NLP.get_hf_objects(model_name, model_cls=model_cls, tokenizer_kwargs=hf_tok_kwargs)
    print(f"architecture:\t{hf_arch}\ntokenizer:\t{type(hf_tokenizer).__name__}\nmodel:\t\t{type(hf_model).__name__}\n")

    # 1. build your DataBlock
    text_gen_kwargs = default_text_gen_kwargs(hf_config, hf_model, task="translation")

    def add_t5_prefix(inp):
        """Prepend the T5 task prefix to `inp` when the current architecture is "t5"; otherwise return `inp` unchanged."""
        return f"translate German to English: {inp}" if (hf_arch == "t5") else inp

    # Pad every input to `inp_seq_sz` so the batch shape asserted below is fixed.
    batch_tokenize_tfm = Seq2SeqBatchTokenizeTransform(
        hf_arch,
        hf_config,
        hf_tokenizer,
        hf_model,
        padding="max_length",
        max_length=inp_seq_sz,
        max_target_length=trg_seq_sz,
        text_gen_kwargs=text_gen_kwargs,
    )

    blocks = (Seq2SeqTextBlock(batch_tokenize_tfm=batch_tokenize_tfm), noop)
    dblock = DataBlock(blocks=blocks, get_x=Pipeline([ColReader("de"), add_t5_prefix]), get_y=ColReader("en"), splitter=RandomSplitter())
    dls = dblock.dataloaders(wmt_df, bs=bsz)
    b = dls.one_batch()

    # 2. build your Learner
    seq2seq_metrics = {}
    model = BaseModelWrapper(hf_model)
    # ShortEpochCallback(0.05, short_valid=True) is there to keep each run short.
    fit_cbs = [ShortEpochCallback(0.05, short_valid=True), Seq2SeqMetricsCallback(custom_metrics=seq2seq_metrics)]

    learn = Learner(
        dls,
        model,
        opt_func=ranger,
        loss_func=PreCalculatedCrossEntropyLoss(),
        cbs=[BaseModelCallback],
        splitter=partial(blurr_seq2seq_splitter, arch=hf_arch),
    ).to_fp16()

    learn.create_opt()
    learn.freeze()

    # 3. Run your tests
    try:
        print("*** TESTING DataLoaders ***\n")
        test_eq(len(b), 2)
        test_eq(len(b[0]["input_ids"]), bsz)
        test_eq(b[0]["input_ids"].shape, torch.Size([bsz, inp_seq_sz]))
        test_eq(len(b[1]), bsz)

        # print('*** TESTING One pass through the model ***')
        # preds = learn.model(b[0])
        # test_eq(preds[1].shape[0], bsz)
        # test_eq(preds[1].shape[2], hf_config.vocab_size)

        print("*** TESTING Training/Results ***")
        learn.fit_one_cycle(1, lr_max=1e-3, cbs=fit_cbs)

        test_results.append((hf_arch, type(hf_tokenizer).__name__, type(hf_model).__name__, "PASSED", ""))
        learn.show_results(learner=learn, max_n=2, input_trunc_at=500, target_trunc_at=250)
    except Exception as err:
        # Record the failure as text rather than the exception object: a stored exception
        # keeps its traceback (and the objects referenced by those frames) alive, which
        # would work against the cleanup in `finally`. It also matches the "" used for PASSED rows.
        test_results.append((hf_arch, type(hf_tokenizer).__name__, type(hf_model).__name__, "FAILED", f"{type(err).__name__}: {err}"))
    finally:
        # cleanup: drop this model's Learner before the next iteration builds a new one
        del learn
        torch.cuda.empty_cache()