# ******************************************************************************
# Copyright 2017-2019 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ******************************************************************************
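"""Training and inference procedures for neural sequence taggers.

This module registers CLI procedures for training a NeuralTagger built on a
CNN-LSTM or ID-CNN embedder, training it with knowledge distillation from a
Transformer teacher (optionally combined with pseudo-labeling of unlabeled
data), and running inference with a trained model.
"""
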
import argparse
import io
import os
import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from nlp_architect.data.sequential_tagging import TokenClsInputExample, TokenClsProcessor
from nlp_architect.data.utils import write_column_tagged_file
from nlp_architect.models.tagging import NeuralTagger
from nlp_architect.nn.torch.modules.embedders import IDCNN, CNNLSTM
from nlp_architect.nn.torch import setup_backend, set_seed
from nlp_architect.nn.torch.distillation import TeacherStudentDistill
from nlp_architect.procedures.procedure import Procedure
from nlp_architect.procedures.registry import register_train_cmd, register_inference_cmd
from nlp_architect.utils.embedding import get_embedding_matrix, load_embedding_file
from nlp_architect.utils.io import prepare_output_path
from nlp_architect.utils.text import SpacyInstance
from nlp_architect.nn.torch.data.dataset import ParallelDataset
from nlp_architect.models.transformers import TransformerTokenClassifier

@register_train_cmd(name="tagger", description="Train a neural tagger")
class TrainTagger(Procedure):
    @staticmethod
    def add_arguments(parser: argparse.ArgumentParser):
        add_parse_args(parser)

    @staticmethod
    def run_procedure(args):
        do_training(args)

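# Example invocation (assuming the standard `nlp_architect` CLI entry point
# that dispatches registered train commands; paths are hypothetical):
#
#   nlp_architect train tagger --data_dir /path/to/data \
#       --output_dir /path/to/out --model_type cnn-lstm -b 32 -e 10
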
@register_train_cmd(
    name="tagger_kd",
    description="Train a neural tagger using Knowledge Distillation"
    " and a Transformer teacher model",
)
class TrainTaggerKD(Procedure):
    @staticmethod
    def add_arguments(parser: argparse.ArgumentParser):
        add_parse_args(parser)
        TeacherStudentDistill.add_args(parser)

    @staticmethod
    def run_procedure(args):
        do_kd_training(args)

@register_train_cmd(
    name="tagger_kd_pseudo",
    description="Train a neural tagger using Knowledge Distillation"
    " and a Transformer teacher model + pseudo-labeling",
)
class TrainTaggerKDPseudo(Procedure):
    @staticmethod
    def add_arguments(parser: argparse.ArgumentParser):
        add_parse_args(parser)
        TeacherStudentDistill.add_args(parser)
        parser.add_argument(
            "--labeled_train_file",
            default="labeled.txt",
            type=str,
            help="The file name containing the labeled training examples",
        )
        parser.add_argument(
            "--unlabeled_train_file",
            default="unlabeled.txt",
            type=str,
            help="The file name containing the unlabeled training examples",
        )

    @staticmethod
    def run_procedure(args):
        do_kd_pseudo_training(args)

@register_inference_cmd(name="tagger", description="Run a neural tagger model")
class RunTagger(Procedure):
    @staticmethod
    def add_arguments(parser: argparse.ArgumentParser):
        parser.add_argument(
            "--data_file",
            default=None,
            type=str,
            required=True,
            help="The data file containing data for inference",
        )
        parser.add_argument(
            "--model_dir", type=str, required=True, help="Path to trained model directory"
        )
        parser.add_argument(
            "--output_dir",
            type=str,
            required=True,
            help="Output directory where the predictions will be saved",
        )
        parser.add_argument(
            "--overwrite_output_dir",
            action="store_true",
            help="Overwrite the content of the output directory",
        )
        parser.add_argument(
            "--no_cuda", action="store_true", help="Avoid using CUDA when available"
        )
        parser.add_argument("-b", type=int, default=100, help="Batch size")

    @staticmethod
    def run_procedure(args):
        do_inference(args)

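# Example invocation (assuming the matching CLI inference dispatcher;
# paths are hypothetical):
#
#   nlp_architect run tagger --data_file input.txt \
#       --model_dir /path/to/trained_model --output_dir /path/to/out -b 100
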
def add_parse_args(parser: argparse.ArgumentParser):
    parser.add_argument(
        "--model_type",
        default="cnn-lstm",
        type=str,
        choices=list(MODEL_TYPE.keys()),
        help="Model type to use for this tagger",
    )
    parser.add_argument("--config_file", type=str, help="Embedder model configuration file")
    parser.add_argument("-b", type=int, default=10, help="Batch size")
    parser.add_argument("-b_ul", type=int, default=10, help="Batch size of unlabeled data")
    parser.add_argument("-e", type=int, default=155, help="Number of epochs")
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help="The input data dir. Should contain dataset files to be parsed by the dataloaders.",
    )
    parser.add_argument(
        "--tag_col",
        type=int,
        default=-1,
        help="Column number of the entity labels in the tab-separated train/test files",
    )
    parser.add_argument("--max_sentence_length", type=int, default=50, help="Max sentence length")
    parser.add_argument(
        "--max_word_length", type=int, default=12, help="Max word length in characters"
    )
    parser.add_argument(
        "--use_crf", action="store_true", help="Use a CRF classifier instead of Softmax"
    )
    parser.add_argument(
        "--lr", type=float, default=0.001, help="Learning rate for the optimizer (Adam)"
    )
    parser.add_argument("--embedding_file", help="Path to external word embedding model file")
    parser.add_argument(
        "--output_dir",
        type=str,
        required=True,
        help="Output directory where the model will be saved",
    )
    parser.add_argument("--seed", type=int, default=42, help="Random seed for initialization")
    parser.add_argument("--logging_steps", type=int, default=50, help="Log every X update steps.")
    parser.add_argument(
        "--save_steps", type=int, default=500, help="Save the model every X update steps."
    )
    parser.add_argument(
        "--overwrite_output_dir",
        action="store_true",
        help="Overwrite the content of the output directory",
    )
    parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available")


# Embedder architectures available to the tagger. Defined after add_parse_args,
# which only reads it when the parser is actually built.
MODEL_TYPE = {"cnn-lstm": CNNLSTM, "id-cnn": IDCNN}

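# To support an additional embedder, register it in MODEL_TYPE. Any class with
# the same constructor signature (vocab_size, num_labels) and the
# `from_config(vocab_size, num_labels, config_file)` factory used in the
# training procedures below should plug in, e.g. (hypothetical class):
#
#   MODEL_TYPE["my-embedder"] = MyEmbedder
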
def do_training(args):
    prepare_output_path(args.output_dir, args.overwrite_output_dir)
    device, n_gpus = setup_backend(args.no_cuda)
    # Set seed
    args.seed = set_seed(args.seed, n_gpus)
    # Prepare data
    processor = TokenClsProcessor(args.data_dir, tag_col=args.tag_col)
    train_ex = processor.get_train_examples()
    dev_ex = processor.get_dev_examples()
    test_ex = processor.get_test_examples()
    vocab = processor.get_vocabulary()
    vocab_size = len(vocab) + 1
    num_labels = len(processor.get_labels()) + 1
    # Create an embedder
    embedder_cls = MODEL_TYPE[args.model_type]
    if args.config_file is not None:
        embedder_model = embedder_cls.from_config(vocab_size, num_labels, args.config_file)
    else:
        embedder_model = embedder_cls(vocab_size, num_labels)
    # Load external word embeddings if present
    if args.embedding_file is not None:
        emb_dict = load_embedding_file(args.embedding_file)
        emb_mat = get_embedding_matrix(emb_dict, vocab)
        emb_mat = torch.tensor(emb_mat, dtype=torch.float)
        embedder_model.load_embeddings(emb_mat)
    classifier = NeuralTagger(
        embedder_model,
        word_vocab=vocab,
        labels=processor.get_labels(),
        use_crf=args.use_crf,
        device=device,
        n_gpus=n_gpus,
    )
    train_batch_size = args.b * max(1, n_gpus)
    train_dataset = classifier.convert_to_tensors(
        train_ex, max_seq_length=args.max_sentence_length, max_word_length=args.max_word_length
    )
    train_sampler = RandomSampler(train_dataset)
    train_dl = DataLoader(train_dataset, sampler=train_sampler, batch_size=train_batch_size)
    # Dev/test splits may be absent; default the loaders to None
    dev_dl = None
    test_dl = None
    if dev_ex is not None:
        dev_dataset = classifier.convert_to_tensors(
            dev_ex, max_seq_length=args.max_sentence_length, max_word_length=args.max_word_length
        )
        dev_sampler = SequentialSampler(dev_dataset)
        dev_dl = DataLoader(dev_dataset, sampler=dev_sampler, batch_size=args.b)
    if test_ex is not None:
        test_dataset = classifier.convert_to_tensors(
            test_ex, max_seq_length=args.max_sentence_length, max_word_length=args.max_word_length
        )
        test_sampler = SequentialSampler(test_dataset)
        test_dl = DataLoader(test_dataset, sampler=test_sampler, batch_size=args.b)
    # Build the optimizer only when a learning rate is given
    opt = classifier.get_optimizer(lr=args.lr) if args.lr is not None else None
    classifier.train(
        train_dl,
        dev_dl,
        test_dl,
        epochs=args.e,
        batch_size=args.b,
        logging_steps=args.logging_steps,
        save_steps=args.save_steps,
        save_path=args.output_dir,
        optimizer=opt,
    )
    classifier.save_model(args.output_dir)

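# A trained model can later be restored with NeuralTagger.load_model, as
# do_inference below does, e.g. (hypothetical path):
#
#   tagger = NeuralTagger.load_model(model_path="/path/to/out")
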
def do_kd_training(args):
    prepare_output_path(args.output_dir, args.overwrite_output_dir)
    device, n_gpus = setup_backend(args.no_cuda)
    # Set seed
    args.seed = set_seed(args.seed, n_gpus)
    # Prepare data
    processor = TokenClsProcessor(args.data_dir, tag_col=args.tag_col)
    train_ex = processor.get_train_examples()
    dev_ex = processor.get_dev_examples()
    test_ex = processor.get_test_examples()
    vocab = processor.get_vocabulary()
    vocab_size = len(vocab) + 1
    num_labels = len(processor.get_labels()) + 1
    # Create an embedder
    embedder_cls = MODEL_TYPE[args.model_type]
    if args.config_file is not None:
        embedder_model = embedder_cls.from_config(vocab_size, num_labels, args.config_file)
    else:
        embedder_model = embedder_cls(vocab_size, num_labels)
    # Load external word embeddings if present
    if args.embedding_file is not None:
        emb_dict = load_embedding_file(args.embedding_file)
        emb_mat = get_embedding_matrix(emb_dict, vocab)
        emb_mat = torch.tensor(emb_mat, dtype=torch.float)
        embedder_model.load_embeddings(emb_mat)
    classifier = NeuralTagger(
        embedder_model,
        word_vocab=vocab,
        labels=processor.get_labels(),
        use_crf=args.use_crf,
        device=device,
        n_gpus=n_gpus,
    )
    train_batch_size = args.b * max(1, n_gpus)
    train_dataset = classifier.convert_to_tensors(
        train_ex, max_seq_length=args.max_sentence_length, max_word_length=args.max_word_length
    )
    # Load the Transformer teacher and tensorize the same examples for it,
    # so each student batch is paired with the matching teacher batch
    teacher = TransformerTokenClassifier.load_model(
        model_path=args.teacher_model_path, model_type=args.teacher_model_type
    )
    teacher.to(device, n_gpus)
    teacher_dataset = teacher.convert_to_tensors(train_ex, args.max_sentence_length, False)
    train_dataset = ParallelDataset(train_dataset, teacher_dataset)
    train_sampler = RandomSampler(train_dataset)
    train_dl = DataLoader(train_dataset, sampler=train_sampler, batch_size=train_batch_size)
    # Dev/test splits may be absent; default the loaders to None
    dev_dl = None
    test_dl = None
    if dev_ex is not None:
        dev_dataset = classifier.convert_to_tensors(
            dev_ex, max_seq_length=args.max_sentence_length, max_word_length=args.max_word_length
        )
        dev_sampler = SequentialSampler(dev_dataset)
        dev_dl = DataLoader(dev_dataset, sampler=dev_sampler, batch_size=args.b)
    if test_ex is not None:
        test_dataset = classifier.convert_to_tensors(
            test_ex, max_seq_length=args.max_sentence_length, max_word_length=args.max_word_length
        )
        test_sampler = SequentialSampler(test_dataset)
        test_dl = DataLoader(test_dataset, sampler=test_sampler, batch_size=args.b)
    # Build the optimizer only when a learning rate is given
    opt = classifier.get_optimizer(lr=args.lr) if args.lr is not None else None
    distiller = TeacherStudentDistill(
        teacher, args.kd_temp, args.kd_dist_w, args.kd_student_w, args.kd_loss_fn
    )
    classifier.train(
        train_dl,
        dev_dl,
        test_dl,
        epochs=args.e,
        batch_size=args.b,
        logging_steps=args.logging_steps,
        save_steps=args.save_steps,
        save_path=args.output_dir,
        optimizer=opt,
        distiller=distiller,
    )
    classifier.save_model(args.output_dir)

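# Note: do_kd_training mirrors do_training, but pairs each student batch with
# the teacher's view of the same examples (ParallelDataset) and hands train()
# a TeacherStudentDistill object which, in standard knowledge-distillation
# fashion, combines the student's label loss with a loss against the teacher's
# temperature-softened outputs, weighted by the --kd_* arguments.
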
def do_kd_pseudo_training(args):
    prepare_output_path(args.output_dir, args.overwrite_output_dir)
    device, n_gpus = setup_backend(args.no_cuda)
    # Set seed
    args.seed = set_seed(args.seed, n_gpus)
    # Prepare data
    processor = TokenClsProcessor(args.data_dir, tag_col=args.tag_col)
    train_labeled_ex = processor.get_train_examples(filename=args.labeled_train_file)
    train_unlabeled_ex = processor.get_train_examples(filename=args.unlabeled_train_file)
    dev_ex = processor.get_dev_examples()
    test_ex = processor.get_test_examples()
    vocab = processor.get_vocabulary()
    vocab_size = len(vocab) + 1
    num_labels = len(processor.get_labels()) + 1
    # Create an embedder
    embedder_cls = MODEL_TYPE[args.model_type]
    if args.config_file is not None:
        embedder_model = embedder_cls.from_config(vocab_size, num_labels, args.config_file)
    else:
        embedder_model = embedder_cls(vocab_size, num_labels)
    # Load external word embeddings if present
    if args.embedding_file is not None:
        emb_dict = load_embedding_file(args.embedding_file)
        emb_mat = get_embedding_matrix(emb_dict, vocab)
        emb_mat = torch.tensor(emb_mat, dtype=torch.float)
        embedder_model.load_embeddings(emb_mat)
    classifier = NeuralTagger(
        embedder_model,
        word_vocab=vocab,
        labels=processor.get_labels(),
        use_crf=args.use_crf,
        device=device,
        n_gpus=n_gpus,
    )
    train_batch_size = args.b * max(1, n_gpus)
    train_batch_size_ul = args.b_ul * max(1, n_gpus)
    train_labeled_dataset = classifier.convert_to_tensors(
        train_labeled_ex,
        max_seq_length=args.max_sentence_length,
        max_word_length=args.max_word_length,
    )
    # The unlabeled examples carry no gold tags; the teacher will provide them
    train_unlabeled_dataset = classifier.convert_to_tensors(
        train_unlabeled_ex,
        max_seq_length=args.max_sentence_length,
        max_word_length=args.max_word_length,
        include_labels=False,
    )
    teacher = TransformerTokenClassifier.load_model(
        model_path=args.teacher_model_path, model_type=args.teacher_model_type
    )
    teacher.to(device, n_gpus)
    teacher_labeled_dataset = teacher.convert_to_tensors(
        train_labeled_ex, args.max_sentence_length, False
    )
    teacher_unlabeled_dataset = teacher.convert_to_tensors(
        train_unlabeled_ex, args.max_sentence_length, False
    )
    # Pair the student and teacher views of each split
    train_labeled_dataset = ParallelDataset(train_labeled_dataset, teacher_labeled_dataset)
    train_unlabeled_dataset = ParallelDataset(train_unlabeled_dataset, teacher_unlabeled_dataset)
    train_labeled_sampler = RandomSampler(train_labeled_dataset)
    train_unlabeled_sampler = RandomSampler(train_unlabeled_dataset)
    train_labeled_dl = DataLoader(
        train_labeled_dataset, sampler=train_labeled_sampler, batch_size=train_batch_size
    )
    train_unlabeled_dl = DataLoader(
        train_unlabeled_dataset, sampler=train_unlabeled_sampler, batch_size=train_batch_size_ul
    )
    # Dev/test splits may be absent; default the loaders to None
    dev_dl = None
    test_dl = None
    if dev_ex is not None:
        dev_dataset = classifier.convert_to_tensors(
            dev_ex, max_seq_length=args.max_sentence_length, max_word_length=args.max_word_length
        )
        dev_sampler = SequentialSampler(dev_dataset)
        dev_dl = DataLoader(dev_dataset, sampler=dev_sampler, batch_size=args.b)
    if test_ex is not None:
        test_dataset = classifier.convert_to_tensors(
            test_ex, max_seq_length=args.max_sentence_length, max_word_length=args.max_word_length
        )
        test_sampler = SequentialSampler(test_dataset)
        test_dl = DataLoader(test_dataset, sampler=test_sampler, batch_size=args.b)
    # Build the optimizer only when a learning rate is given
    opt = classifier.get_optimizer(lr=args.lr) if args.lr is not None else None
    distiller = TeacherStudentDistill(
        teacher, args.kd_temp, args.kd_dist_w, args.kd_student_w, args.kd_loss_fn
    )
    classifier.train_pseudo(
        train_labeled_dl,
        train_unlabeled_dl,
        distiller,
        dev_dl,
        test_dl,
        batch_size_l=args.b,
        batch_size_ul=args.b_ul,
        epochs=args.e,
        logging_steps=args.logging_steps,
        save_steps=args.save_steps,
        save_path=args.output_dir,
        optimizer=opt,
    )
    classifier.save_model(args.output_dir)

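# Note: train_pseudo additionally walks the unlabeled loader; those batches
# were built with include_labels=False, so their training signal comes only
# from the teacher (distillation soft targets / pseudo-labels).
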
def do_inference(args):
    prepare_output_path(args.output_dir, args.overwrite_output_dir)
    device, n_gpus = setup_backend(args.no_cuda)
    # Scale the batch size by GPU count, mirroring the training procedures
    batch_size = args.b * max(1, n_gpus)
    inference_examples = process_inference_input(args.data_file)
    classifier = NeuralTagger.load_model(model_path=args.model_dir)
    classifier.to(device, n_gpus)
    output = classifier.inference(inference_examples, batch_size)
    write_column_tagged_file(os.path.join(args.output_dir, "output.txt"), output)

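# write_column_tagged_file presumably emits the predictions in the same
# tab-separated, token-per-line column format the processors read; the exact
# layout is defined in nlp_architect.data.utils.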