---
title: Multi-label classification
keywords: fastai
sidebar: home_sidebar
summary: "This is an example of how to use blurr for multi-label classification tasks using both the mid- and high-level Blurr API."
description: "This is an example of how to use blurr for multi-label classification tasks using both the mid- and high-level Blurr API."
nb_path: "nbs/99d_text-examples-multilabel.ipynb"
---
Let's start by building our `DataBlock`.
raw_datasets = datasets.load_dataset("civil_comments")
raw_datasets
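# --- Option 1: A 10k/2k random sample of the train/validation splits ---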
raw_train_df = raw_datasets["train"].shuffle(seed=42).select(range(10000)).to_pandas()
raw_valid_df = raw_datasets["validation"].shuffle(seed=42).select(range(2000)).to_pandas()
# --- Option 2: Full dataset (using the predefined training and validation sets) ---
# raw_train_df = pd.DataFrame(raw_datasets['train'], columns=list(raw_datasets['train'].features.keys()))
# raw_valid_df = pd.DataFrame(raw_datasets['validation'], columns=list(raw_datasets['validation'].features.keys()))
raw_train_df["is_valid"] = False
raw_valid_df["is_valid"] = True
toxic_df = pd.concat([raw_train_df, raw_valid_df])
print(len(toxic_df))
toxic_df.head()
lbl_cols = ["identity_attack", "insult", "obscene", "toxicity", "severe_toxicity", "sexual_explicit", "threat"]
lbl_cols
toxic_df = toxic_df.round({col: 0 for col in lbl_cols})  # round the fractional toxicity scores to hard 0/1 labels
toxic_df = toxic_df.convert_dtypes()
toxic_df.head()
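Before building the `DataBlock`, it can be useful to confirm that the rounded label columns really are 0/1 and to see how imbalanced each label is (a quick, optional check):
# number of positive examples per label
toxic_df[lbl_cols].sum()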
For our huggingface model, let's use the distilled version of RoBERTa. This should allow us to train on bigger mini-batches without much loss in performance. Even on my 1080Ti, I should be able to train all the parameters (which isn't possible with the full `roberta-base` model).
model_cls = AutoModelForSequenceClassification
pretrained_model_name = "distilroberta-base"
config = AutoConfig.from_pretrained(pretrained_model_name)
config.num_labels = len(lbl_cols)
hf_arch, hf_config, hf_tokenizer, hf_model = NLP.get_hf_objects(pretrained_model_name, model_cls=model_cls, config=config)
hf_model.config.problem_type = "multi_label_classification"  # tells the huggingface model to treat the targets as multi-label (BCE-style) when it computes the loss itself
print(hf_arch)
print(type(hf_config))
print(type(hf_tokenizer))
print(type(hf_model))
Note how we have to set `num_labels` to the number of labels we are predicting. Given that our labels are already one-hot encoded, we use a `MultiCategoryBlock` with `encoded=True` and a `vocab` equal to the columns holding our 1's and 0's.
blocks = (TextBlock(hf_arch, hf_config, hf_tokenizer, hf_model), MultiCategoryBlock(encoded=True, vocab=lbl_cols))
dblock = DataBlock(blocks=blocks, get_x=ColReader("text"), get_y=ColReader(lbl_cols), splitter=ColSplitter())
dls = dblock.dataloaders(toxic_df, bs=4, val_bs=8)
b = dls.one_batch()
len(b), b[0]["input_ids"].shape, b[1].shape
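If you want to eyeball a decoded mini-batch before wiring up the `Learner`, blurr's `show_batch` works here just as in the low-level example further down (a purely optional sanity check):
dls.show_batch(dataloaders=dls, max_n=2, trunc_at=500)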
model = BaseModelWrapper(hf_model)
learn = Learner(
    dls,
    model,
    opt_func=partial(Adam),
    loss_func=BCEWithLogitsLossFlat(),  # PreCalculatedBCELoss()
    metrics=[partial(accuracy_multi, thresh=0.2)],
    cbs=[BaseModelCallback],
    splitter=blurr_splitter,
).to_fp16()
learn.loss_func.thresh = 0.15
learn.freeze()
learn.summary()
preds = model(b[0])
preds.logits.shape, preds
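The wrapped model returns raw logits; pushing them through a sigmoid turns them into per-label probabilities. A quick, optional sketch (torch comes in with the fastai imports):
probs = torch.sigmoid(preds.logits)
probs.shape, probs[0]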
learn.lr_find()
learn.fit_one_cycle(1, lr_max=1e-2)
learn.show_results(learner=learn, max_n=2)
learn.unfreeze()
learn.fit_one_cycle(1, lr_max=slice(1e-8, 1e-4))
learn.show_results(learner=learn, max_n=2)
learn.loss_func.thresh = 0.02
comment = """
Those damned affluent white people should only eat their own food, like cod cakes and boiled potatoes.
No enchiladas for them!
"""
learn.blurr_predict(comment)
preds, targs, losses = learn.get_preds(with_loss=True)
preds.shape, targs.shape, losses.shape
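Note that `get_preds` applies the activation attached to `BCEWithLogitsLossFlat` (a sigmoid), so the returned `preds` are already probabilities; thresholding them directly gives the multi-hot predictions. A minimal sketch using the threshold set above:
decoded = (preds > learn.loss_func.thresh).int()
decoded[:2], targs[:2]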
model_cls = AutoModelForSequenceClassification
pretrained_model_name = "distilroberta-base"
config = AutoConfig.from_pretrained(pretrained_model_name)
config.num_labels = len(lbl_cols)
hf_arch, hf_config, hf_tokenizer, hf_model = NLP.get_hf_objects(pretrained_model_name, model_cls=model_cls, config=config)
hf_model.config.problem_type = "multi_label_classification"
print(hf_arch)
print(type(hf_config))
print(type(hf_tokenizer))
print(type(hf_model))
We can do everything above with even less code by using the high-level API's `BlearnerForSequenceClassification` class.
learn = BlearnerForSequenceClassification.from_data(
    toxic_df, pretrained_model_name, text_attr="text", label_attr=lbl_cols, dl_kwargs={"bs": 4}
)
learn.loss_func.thresh = 0.1
learn.fit_one_cycle(1, lr_max=1e-2)
learn.show_results(learner=learn, max_n=2)
learn.loss_func.thresh = 0.01
comment = """
Those damned affluent white people should only eat their own food, like cod cakes and boiled potatoes.
No enchiladas for them!
"""
learn.predict(comment)
preds, targs = learn.get_preds(with_loss=False)
preds.shape, targs.shape
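Assuming the Blearner is also using `BCEWithLogitsLossFlat` (which is what the `thresh` attribute above suggests), these `preds` have already been passed through a sigmoid, so you can score them with fastai's `accuracy_multi` directly (passing `sigmoid=False` so it isn't applied twice). A quick sketch:
accuracy_multi(preds, targs, thresh=learn.loss_func.thresh, sigmoid=False)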
model_cls = AutoModelForSequenceClassification
pretrained_model_name = "distilroberta-base"
config = AutoConfig.from_pretrained(pretrained_model_name)
config.num_labels = len(lbl_cols)
hf_arch, hf_config, hf_tokenizer, hf_model = NLP.get_hf_objects(pretrained_model_name, model_cls=model_cls, config=config)
hf_model.config.problem_type = "multi_label_classification"
print(hf_arch)
print(type(hf_config))
print(type(hf_tokenizer))
print(type(hf_model))
raw_datasets = datasets.load_dataset("civil_comments", split=["train[:1000]", "validation[:500]"])
raw_datasets
def tokenize_function(example):
    # tokenize the comment text
    inputs = hf_tokenizer(example["text"], truncation=True)
    # round each raw toxicity score to a hard 0./1. target, in the same order as `label_names` below
    targets = [
        float(round(example[lbl]))
        for lbl in ["identity_attack", "insult", "obscene", "severe_toxicity", "sexual_explicit", "threat", "toxicity"]
    ]
    return {**inputs, **{"labels": targets}}
tokenized_datasets = [ds.map(tokenize_function, batched=False) for ds in raw_datasets]
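It's worth peeking at one preprocessed example to confirm the tokenizer output and the seven float labels ended up where we expect (a quick, optional check):
tokenized_datasets[0][0]["labels"]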
label_names = ["identity_attack", "insult", "obscene", "severe_toxicity", "sexual_explicit", "threat", "toxicity"]
trn_dl = TextDataLoader(
    tokenized_datasets[0],
    hf_arch=hf_arch,
    hf_config=hf_config,
    hf_tokenizer=hf_tokenizer,
    hf_model=hf_model,
    preproccesing_func=preproc_hf_dataset,
    batch_decode_kwargs={"labels": label_names},
    shuffle=True,
    batch_size=8,
)
val_dl = TextDataLoader(
    tokenized_datasets[1],
    hf_arch=hf_arch,
    hf_config=hf_config,
    hf_tokenizer=hf_tokenizer,
    hf_model=hf_model,
    preproccesing_func=preproc_hf_dataset,
    batch_decode_kwargs={"labels": label_names},
    batch_size=16,
)
dls = DataLoaders(trn_dl, val_dl)
b = dls.one_batch()
b[0]["input_ids"].shape, b[1].shape
dls.show_batch(dataloaders=dls, max_n=2, trunc_at=500)
model = BaseModelWrapper(hf_model)
learn = Learner(
    dls,
    model,
    opt_func=partial(Adam),
    loss_func=PreCalculatedBCELoss(),  # use the loss calculated by the huggingface model itself
    metrics=[partial(accuracy_multi, thresh=0.2)],
    cbs=[BaseModelCallback],
    splitter=blurr_splitter,
).to_fp16()
learn.loss_func.thresh = 0.1
learn.freeze()
learn.fit(1, 1e-2)
learn.show_results(learner=learn, max_n=2)
If your sequence classification model isn't training, make sure you have set `num_labels` correctly (95% of the time this is the culprit). And with this example, you can see that Blurr makes both your multiclass and multi-label classification tasks a breeze.