Multilingual Sequence Classification with the MBart Family
Need to do some multilingual sequence classification? Look no further, at least if you want to use MBart and/or the MBart-50 variety of models. Working with the `amazon_reviews_multi` dataset, I'll show you how to use the `blurr` library to configure the huggingface objects, build DataLoaders, and train a model you can use for classifying German text. I'll throw in a bit of inference code to boot, so you can see how easy `blurr` makes it to use your trained model. Let's go ...
!pip install ohmeow-blurr -q
!pip install datasets -q
from fastai.text.all import *
from datasets import load_dataset
from transformers import AutoModelForSequenceClassification
from blurr.utils import BLURR
from blurr.data.core import *
from blurr.modeling.core import *
from blurr import __version__ as blurr_version
from fastai import __version__ as fa_version
from torch import __version__ as pt_version
from transformers import __version__ as hft_version
print(f'Using blurr {blurr_version}')
print(f'Using pytorch {pt_version}')
print(f'Using fastai {fa_version}')
print(f'Using transformers {hft_version}')
trn_ds, val_ds, tst_ds = load_dataset("amazon_reviews_multi", "de", split=['train', 'validation', 'test'])
trn_df = pd.DataFrame(trn_ds)
val_df = pd.DataFrame(val_ds)
tst_df = pd.DataFrame(tst_ds)
To keep training times sane for this tutorial, we'll work with a 5% sample of the data. Once you've got everything working, throw the full dataset at it and go get some coffee :)
# this won't work because the rows are ordered by our targets!
# trn_ds, val_ds = load_dataset("amazon_reviews_multi", "de", split=['train[:10%]', 'validation[:10%]'])
trn_df = trn_df.sample(frac=0.05)
val_df = val_df.sample(frac=0.05)
trn_df['is_valid'] = False; val_df['is_valid'] = True
df = pd.concat([trn_df, val_df])
print(len(trn_df), len(val_df), len(df))
trn_df.head()
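One quick note: `sample` grabs rows at random, so the 5% subset changes every run. If you want a reproducible subset (a minimal sketch of my own; `random_state=42` is just an arbitrary seed I picked), pass a seed like so:
# hypothetical variant: seed the sampling so you get the same 5% subset every run
trn_df = pd.DataFrame(trn_ds).sample(frac=0.05, random_state=42)
val_df = pd.DataFrame(val_ds).sample(frac=0.05, random_state=42)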
unique_tgt_vals = trn_df.stars.value_counts()
print(unique_tgt_vals)
labels = sorted(list(df.stars.unique()))
print(labels)
model_name = "facebook/mbart-large-50"
model_cls = AutoModelForSequenceClassification
hf_tok_kwargs = {'src_lang': 'de_DE', 'tgt_lang': 'de_DE'}
hf_arch, hf_config, hf_tokenizer, hf_model = BLURR.get_hf_objects(model_name,
model_cls=model_cls,
tokenizer_kwargs=hf_tok_kwargs,
config_kwargs={'num_labels': len(labels)})
print('arch: ', type(hf_arch))
print('config: ', type(hf_config))
print('tokenizer: ', type(hf_tokenizer))
print('model: ', type(hf_model))
hf_config
blocks = (HF_TextBlock(hf_arch, hf_config, hf_tokenizer, hf_model, max_length=256), CategoryBlock)
dblock = DataBlock(blocks=blocks, get_x=ColReader('review_body'), get_y=ColReader('stars'), splitter=ColSplitter())
dls = dblock.dataloaders(df, bs=4)
It's always a good idea to check the `shape` of things in your batches (esp. when debugging). I set `max_length` above to ensure the inputs weren't longer than 256 tokens, and voila, you have the tutorial before you now. Of course, you should run this with the biggest batch size and sequence length your GPU(s) will support.
xb, yb = dls.one_batch()
xb['input_ids'].shape
dls.show_batch(dataloaders=dls, max_n=2, trunc_at=1500)
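If you want to eyeball exactly what the model will see (a minimal sketch using the `hf_tokenizer` we grabbed above), you can decode one row of `input_ids` back into text:
# decode the first example in the batch; skip_special_tokens drops the
# padding and language-code tokens that MBart tokenization adds
print(hf_tokenizer.decode(xb['input_ids'][0], skip_special_tokens=True))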
Print out the model so we can build a custom set of parameter groups for an MBart + Sequence Classification task
def mbart_splitter(m):
    # blurr wraps the huggingface model in a HF_BaseModelWrapper, so unwrap it if needed
    model = m.hf_model if (hasattr(m, 'hf_model')) else m

    # group the encoder/decoder embeddings together so they can be (un)frozen as a unit
    embeds_modules = [
        model.model.encoder.embed_positions,
        model.model.encoder.embed_tokens,
        model.model.decoder.embed_positions,
        model.model.decoder.embed_tokens
    ]

    embeds = nn.Sequential(*embeds_modules)

    # 4 parameter groups: embeddings, encoder, decoder, classification head
    groups = L(embeds, model.model.encoder, model.model.decoder, model.classification_head)
    return groups.map(params).filter(lambda el: len(el) > 0)
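As a quick sanity check (a little sketch of my own, not something blurr requires), you can confirm the splitter yields the four non-empty parameter groups we expect before handing it to the Learner:
# expect 4 groups: embeddings, encoder, decoder, classification head
param_groups = mbart_splitter(hf_model)
print(len(param_groups), [len(pg) for pg in param_groups])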
Configure our metrics, along with the callbacks required by blurr
precision = Precision(average='macro')
recall = Recall(average='macro')
f1 = F1Score(average='macro')
learn_metrics = [accuracy, precision, recall, f1]
learn_cbs = [HF_BaseModelCallback]
Configure our Learner and train away ...
model = HF_BaseModelWrapper(hf_model)
learn = Learner(dls, model, opt_func=Adam, loss_func=CrossEntropyLossFlat(), metrics=learn_metrics, cbs=learn_cbs, splitter=mbart_splitter)
learn.freeze()
print(len(learn.opt.param_groups))
learn.lr_find(suggestions=True)
learn.fit_one_cycle(1, lr_max=7e-5)
learn.show_results(learner=learn, max_n=2, trunc_at=1500)
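Before unfreezing anything, it can be handy to checkpoint what we have so far (a minimal sketch; the filename is an arbitrary choice of mine):
# save model + optimizer state so we can roll back if the next stage goes sideways
learn.save('mbart_seq_cls_stage1')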
We'll freeze all the parameter groups with the exception of the decoder and `classification_head` groups (the last 2)
learn.freeze_to(-2)
learn.lr_find(suggestions=True)
learn.fit_one_cycle(3, lr_max=slice(2e-8, 2e-7))
learn.recorder.plot_loss()
learn.show_results(learner=learn, max_n=2, trunc_at=1500)
txt = tst_df.review_body[0]
print(txt)
learn.blurr_predict(txt)
txts = list(tst_df.review_body.values[1:10])
print(txts)
learn.blurr_predict(txts)
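When you're happy with the results, you can package the model up for later. Here's a minimal sketch of the standard fastai export/reload workflow (the filename and the sample German sentence are mine, and depending on your blurr version, exporting transformers-backed learners may need extra steps):
# export the learner and reload it for inference elsewhere
learn.export(fname='mbart_de_reviews_export.pkl')
inf_learn = load_learner('mbart_de_reviews_export.pkl')
inf_learn.blurr_predict('Das Produkt ist großartig!')  # 'The product is great!'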
Well, that's it!
I hope this article helps you out on your fastai + huggingface + blurr adventures, and hey, if I'm doing something wrong above, please let me know! I'm far from perfect :)
For more information on the MBart/MBart-50 architecture, see the huggingface docs here.