Module ktrain.text.ner.anago.preprocessing

Preprocessors.

Expand source code
# -*- coding: utf-8 -*-
"""
Preprocessors.
"""

from ....imports import *
from .... import utils as U
from .utils import Vocabulary


try:
    from allennlp.modules.elmo import Elmo, batch_to_ids
    ALLENNLP_INSTALLED = True
except:
    ALLENNLP_INSTALLED = False


options_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json'
weight_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5'



def normalize_number(text):
    return re.sub(r'[0-9０-９]', r'0', text)


class IndexTransformer(BaseEstimator, TransformerMixin):
    """Convert a collection of raw documents to a document id matrix.

    Attributes:
        _use_char: boolean. Whether to use char feature.
        _num_norm: boolean. Whether to normalize text.
        _word_vocab: dict. A mapping of words to feature indices.
        _char_vocab: dict. A mapping of chars to feature indices.
        _label_vocab: dict. A mapping of labels to feature indices.
    """

    def __init__(self, lower=True, num_norm=True,
                 use_char=True, initial_vocab=None,
                 use_elmo=False):
        """Create a preprocessor object.

        Args:
            lower: boolean. Whether to convert the texts to lowercase.
            use_char: boolean. Whether to use char feature.
            num_norm: boolean. Whether to normalize text.
            initial_vocab: Iterable. Initial vocabulary for expanding word_vocab.
            use_elmo: If True, will generate contextual English Elmo embeddings
        """
        self._num_norm = num_norm
        self._use_char = use_char
        self._word_vocab = Vocabulary(lower=lower)
        self._char_vocab = Vocabulary(lower=False)
        self._label_vocab = Vocabulary(lower=False, unk_token=False)

        if initial_vocab:
            self._word_vocab.add_documents([initial_vocab])
            self._char_vocab.add_documents(initial_vocab)

        self.elmo = None  # elmo embedding model
        self.use_elmo = False
        self.te = None    # transformer embedding model
        self.te_layers = U.DEFAULT_TRANSFORMER_LAYERS
        self.te_model = None
        self._blacklist = ['te', 'elmo']

    def __getstate__(self):
        return {k: v for k, v in self.__dict__.items() if k not in self._blacklist}

    def __setstate__(self, state):
        self.__dict__.update(state)
        if not hasattr(self, 'te_model'): self.te_model = None
        if not hasattr(self, 'use_elmo'): self.use_elmo = False
        if not hasattr(self, 'te_layers'): self.te_layers = U.DEFAULT_TRANSFORMER_LAYERS

        if self.te_model is not None: self.activate_transformer(self.te_model, layers=self.te_layers)
        else:
            self.te = None
        if self.use_elmo:  
            self.activate_elmo()
        else:
            self.elmo = None


    def activate_elmo(self):
        if not ALLENNLP_INSTALLED:
            raise Exception(ALLENNLP_ERRMSG)

        if not hasattr(self, 'elmo'): self.elmo=None
        if self.elmo is None:
            self.elmo = Elmo(options_file, weight_file, 2, dropout=0)
        self.use_elmo = True

    def activate_transformer(self, model_name, layers=U.DEFAULT_TRANSFORMER_LAYERS, 
                              force=False):
        from ...preprocessor import TransformerEmbedding
        if not hasattr(self, 'te'): self.te = None
        if self.te is None or self.te_model != model_name or force:
            self.te_model = model_name
            self.te = TransformerEmbedding(model_name, layers=layers)
        self.te_layers = layers

    def get_transformer_dim(self):
        if not self.transformer_is_activated(): 
            return None
        else:
            return self.te.embsize


    def elmo_is_activated(self):
        return self.elmo is not None


    def transformer_is_activated(self):
        return self.te is not None

            
    def fix_tokenization(self, X, Y, maxlen=U.DEFAULT_TRANSFORMER_MAXLEN, num_special=U.DEFAULT_TRANSFORMER_NUM_SPECIAL):
        """
        Should be called prior training
        """
        if not self.transformer_is_activated():
            return X, Y
        ids2tok = self.te.tokenizer.convert_ids_to_tokens
        encode = self.te.tokenizer.encode
        new_X = []
        new_Y = []
        for i, x in enumerate(X):
            new_x = []
            new_y =[]
            seq_len = 0
            for j,s in enumerate(x):
                subtokens = ids2tok(encode(s, add_special_tokens=False))
                token_len = len(subtokens)
                if seq_len + token_len > (maxlen - num_special):
                    break
                seq_len += token_len
                hf_s = ' '.join(subtokens).replace(' ##', '').split()
                new_x.extend(hf_s)
                if Y is not None:
                    tag = Y[i][j]
                    new_y.extend([tag])
                    if len(hf_s) > 1:
                        new_tag = tag
                        if tag.startswith('B-'): new_tag = 'I-'+tag[2:]
                        new_y.extend([new_tag]*(len(hf_s)-1) )
                    #if tag.startswith('B-'): tag = 'I-'+tag[2:]

            new_X.append(new_x)
            new_Y.append(new_y)
        new_Y = None if Y is None else new_Y
        return new_X, new_Y


    def fit(self, X, y):
        """Learn vocabulary from training set.

        Args:
            X : iterable. An iterable which yields either str, unicode or file objects.

        Returns:
            self : IndexTransformer.
        """
        self._word_vocab.add_documents(X)
        self._label_vocab.add_documents(y)
        if self._use_char:
            for doc in X:
                self._char_vocab.add_documents(doc)

        self._word_vocab.build()
        self._char_vocab.build()
        self._label_vocab.build()

        return self


    def transform(self, X, y=None):
        """Transform documents to document ids.

        Uses the vocabulary learned by fit.

        Args:
            X : iterable
            an iterable which yields either str, unicode or file objects.
            y : iterable. Label strings.

        Returns:
            features: document id matrix.
            y: label id matrix.
        """
        # re-instantiate TransformerEmbedding/Elmo if necessary since it is excluded from pickling
        if self.te_model is not None: self.activate_transformer(self.te_model, layers=self.te_layers)
        if self.use_elmo: self.activate_elmo()


        features = []

        word_ids = [self._word_vocab.doc2id(doc) for doc in X]
        word_ids = sequence.pad_sequences(word_ids, padding='post')
        features.append(word_ids)

        if self._use_char:
            char_ids = [[self._char_vocab.doc2id(w) for w in doc] for doc in X]
            char_ids = pad_nested_sequences(char_ids)
            features.append(char_ids)

        if self.elmo is not None:
            if not ALLENNLP_INSTALLED:        
                raise Exception(ALLENNLP_ERRMSG)

            character_ids = batch_to_ids(X)
            elmo_embeddings = self.elmo(character_ids)['elmo_representations'][1]
            elmo_embeddings = elmo_embeddings.detach().numpy()
            features.append(elmo_embeddings)

        if self.te is not None:
            transformer_embeddings = self.te.embed(X, word_level=True)
            features.append(transformer_embeddings)


        if y is not None:
            y = [self._label_vocab.doc2id(doc) for doc in y]
            y = sequence.pad_sequences(y, padding='post')
            y = to_categorical(y, self.label_size).astype(int)
            # In 2018/06/01, to_categorical is a bit strange.
            # >>> to_categorical([[1,3]], num_classes=4).shape
            # (1, 2, 4)
            # >>> to_categorical([[1]], num_classes=4).shape
            # (1, 4)
            # So, I expand dimensions when len(y.shape) == 2.
            y = y if len(y.shape) == 3 else np.expand_dims(y, axis=0)
            return features, y
        else:
            return features


    def fit_transform(self, X, y=None, **params):
        """Learn vocabulary and return document id matrix.

        This is equivalent to fit followed by transform.

        Args:
            X : iterable
            an iterable which yields either str, unicode or file objects.

        Returns:
            list : document id matrix.
            list: label id matrix.
        """
        return self.fit(X, y).transform(X, y)

    def inverse_transform(self, y, lengths=None):
        """Return label strings.

        Args:
            y: label id matrix.
            lengths: sentence lengths.

        Returns:
            list: list of list of strings.
        """
        y = np.argmax(y, -1)
        inverse_y = [self._label_vocab.id2doc(ids) for ids in y]
        if lengths is not None:
            inverse_y = [iy[:l] for iy, l in zip(inverse_y, lengths)]

        return inverse_y

    @property
    def word_vocab_size(self):
        return len(self._word_vocab)

    @property
    def char_vocab_size(self):
        return len(self._char_vocab)

    @property
    def label_size(self):
        return len(self._label_vocab)

    def save(self, file_path):
        joblib.dump(self, file_path)

    @classmethod
    def load(cls, file_path):
        p = joblib.load(file_path)

        return p


def pad_nested_sequences(sequences, dtype='int32'):
    """Pads nested sequences to the same length.

    This function transforms a list of list sequences
    into a 3D Numpy array of shape `(num_samples, max_sent_len, max_word_len)`.

    Args:
        sequences: List of lists of lists.
        dtype: Type of the output sequences.

    # Returns
        x: Numpy array.
    """
    max_sent_len = 0
    max_word_len = 0
    for sent in sequences:
        max_sent_len = max(len(sent), max_sent_len)
        for word in sent:
            max_word_len = max(len(word), max_word_len)

    x = np.zeros((len(sequences), max_sent_len, max_word_len)).astype(dtype)
    for i, sent in enumerate(sequences):
        for j, word in enumerate(sent):
            x[i, j, :len(word)] = word

    return x

Functions

def normalize_number(text)
Expand source code
def normalize_number(text):
    return re.sub(r'[0-9０-９]', r'0', text)
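
For illustration, digits are collapsed to '0' so that numeric tokens share a single vocabulary entry:

>>> normalize_number('Flight AB1234 departs at 09:45')
'Flight AB0000 departs at 00:00'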
def pad_nested_sequences(sequences, dtype='int32')

Pads nested sequences to the same length.

This function transforms a list of list sequences into a 3D Numpy array of shape (num_samples, max_sent_len, max_word_len).

Args

sequences
List of lists of lists.
dtype
Type of the output sequences.

Returns

x: Numpy array.
Expand source code
def pad_nested_sequences(sequences, dtype='int32'):
    """Pads nested sequences to the same length.

    This function transforms a list of list sequences
    into a 3D Numpy array of shape `(num_samples, max_sent_len, max_word_len)`.

    Args:
        sequences: List of lists of lists.
        dtype: Type of the output sequences.

    # Returns
        x: Numpy array.
    """
    max_sent_len = 0
    max_word_len = 0
    for sent in sequences:
        max_sent_len = max(len(sent), max_sent_len)
        for word in sent:
            max_word_len = max(len(word), max_word_len)

    x = np.zeros((len(sequences), max_sent_len, max_word_len)).astype(dtype)
    for i, sent in enumerate(sequences):
        for j, word in enumerate(sent):
            x[i, j, :len(word)] = word

    return x
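
For example, two sentences with different numbers of words and different word lengths are padded into a single 3D array:

>>> x = pad_nested_sequences([[[1, 2], [3]], [[4, 5, 6]]])
>>> x.shape
(2, 2, 3)
>>> x[0]
array([[1, 2, 0],
       [3, 0, 0]], dtype=int32)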

Classes

class IndexTransformer (lower=True, num_norm=True, use_char=True, initial_vocab=None, use_elmo=False)

Convert a collection of raw documents to a document id matrix.

Attributes

_use_char
boolean. Whether to use char feature.
_num_norm
boolean. Whether to normalize text.
_word_vocab
dict. A mapping of words to feature indices.
_char_vocab
dict. A mapping of chars to feature indices.
_label_vocab
dict. A mapping of labels to feature indices.

Create a preprocessor object.

Args

lower
boolean. Whether to convert the texts to lowercase.
use_char
boolean. Whether to use char feature.
num_norm
boolean. Whether to normalize text.
initial_vocab
Iterable. Initial vocabulary for expanding word_vocab.
use_elmo
If True, will generate contextual English Elmo embeddings
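
A minimal usage sketch (the tokenized sentences and tags below are made-up toy data; within ktrain this preprocessor is normally constructed for you by the NER data-loading utilities):

p = IndexTransformer(use_char=True)
x_train = [['Paris', 'is', 'nice'], ['I', 'like', 'Paris']]
y_train = [['B-LOC', 'O', 'O'], ['O', 'O', 'B-LOC']]
p.fit(x_train, y_train)
features, y = p.transform(x_train, y_train)   # features[0]: word ids, features[1]: char ids
print(p.word_vocab_size, p.char_vocab_size, p.label_size)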
Expand source code
class IndexTransformer(BaseEstimator, TransformerMixin):
    """Convert a collection of raw documents to a document id matrix.

    Attributes:
        _use_char: boolean. Whether to use char feature.
        _num_norm: boolean. Whether to normalize text.
        _word_vocab: dict. A mapping of words to feature indices.
        _char_vocab: dict. A mapping of chars to feature indices.
        _label_vocab: dict. A mapping of labels to feature indices.
    """

    def __init__(self, lower=True, num_norm=True,
                 use_char=True, initial_vocab=None,
                 use_elmo=False):
        """Create a preprocessor object.

        Args:
            lower: boolean. Whether to convert the texts to lowercase.
            use_char: boolean. Whether to use char feature.
            num_norm: boolean. Whether to normalize text.
            initial_vocab: Iterable. Initial vocabulary for expanding word_vocab.
            use_elmo: If True, will generate contextual English Elmo embeddings
        """
        self._num_norm = num_norm
        self._use_char = use_char
        self._word_vocab = Vocabulary(lower=lower)
        self._char_vocab = Vocabulary(lower=False)
        self._label_vocab = Vocabulary(lower=False, unk_token=False)

        if initial_vocab:
            self._word_vocab.add_documents([initial_vocab])
            self._char_vocab.add_documents(initial_vocab)

        self.elmo = None  # elmo embedding model
        self.use_elmo = False
        self.te = None    # transformer embedding model
        self.te_layers = U.DEFAULT_TRANSFORMER_LAYERS
        self.te_model = None
        self._blacklist = ['te', 'elmo']

    def __getstate__(self):
        return {k: v for k, v in self.__dict__.items() if k not in self._blacklist}

    def __setstate__(self, state):
        self.__dict__.update(state)
        if not hasattr(self, 'te_model'): self.te_model = None
        if not hasattr(self, 'use_elmo'): self.use_elmo = False
        if not hasattr(self, 'te_layers'): self.te_layers = U.DEFAULT_TRANSFORMER_LAYERS

        if self.te_model is not None: self.activate_transformer(self.te_model, layers=self.te_layers)
        else:
            self.te = None
        if self.use_elmo:  
            self.activate_elmo()
        else:
            self.elmo = None


    def activate_elmo(self):
        if not ALLENNLP_INSTALLED:
            raise Exception(ALLENNLP_ERRMSG)

        if not hasattr(self, 'elmo'): self.elmo=None
        if self.elmo is None:
            self.elmo = Elmo(options_file, weight_file, 2, dropout=0)
        self.use_elmo = True

    def activate_transformer(self, model_name, layers=U.DEFAULT_TRANSFORMER_LAYERS, 
                              force=False):
        from ...preprocessor import TransformerEmbedding
        if not hasattr(self, 'te'): self.te = None
        if self.te is None or self.te_model != model_name or force:
            self.te_model = model_name
            self.te = TransformerEmbedding(model_name, layers=layers)
        self.te_layers = layers

    def get_transformer_dim(self):
        if not self.transformer_is_activated(): 
            return None
        else:
            return self.te.embsize


    def elmo_is_activated(self):
        return self.elmo is not None


    def transformer_is_activated(self):
        return self.te is not None

            
    def fix_tokenization(self, X, Y, maxlen=U.DEFAULT_TRANSFORMER_MAXLEN, num_special=U.DEFAULT_TRANSFORMER_NUM_SPECIAL):
        """
        Should be called prior training
        """
        if not self.transformer_is_activated():
            return X, Y
        ids2tok = self.te.tokenizer.convert_ids_to_tokens
        encode = self.te.tokenizer.encode
        new_X = []
        new_Y = []
        for i, x in enumerate(X):
            new_x = []
            new_y =[]
            seq_len = 0
            for j,s in enumerate(x):
                subtokens = ids2tok(encode(s, add_special_tokens=False))
                token_len = len(subtokens)
                if seq_len + token_len > (maxlen - num_special):
                    break
                seq_len += token_len
                hf_s = ' '.join(subtokens).replace(' ##', '').split()
                new_x.extend(hf_s)
                if Y is not None:
                    tag = Y[i][j]
                    new_y.extend([tag])
                    if len(hf_s) > 1:
                        new_tag = tag
                        if tag.startswith('B-'): new_tag = 'I-'+tag[2:]
                        new_y.extend([new_tag]*(len(hf_s)-1) )
                    #if tag.startswith('B-'): tag = 'I-'+tag[2:]

            new_X.append(new_x)
            new_Y.append(new_y)
        new_Y = None if Y is None else new_Y
        return new_X, new_Y


    def fit(self, X, y):
        """Learn vocabulary from training set.

        Args:
            X : iterable. An iterable which yields either str, unicode or file objects.

        Returns:
            self : IndexTransformer.
        """
        self._word_vocab.add_documents(X)
        self._label_vocab.add_documents(y)
        if self._use_char:
            for doc in X:
                self._char_vocab.add_documents(doc)

        self._word_vocab.build()
        self._char_vocab.build()
        self._label_vocab.build()

        return self


    def transform(self, X, y=None):
        """Transform documents to document ids.

        Uses the vocabulary learned by fit.

        Args:
            X : iterable
            an iterable which yields either str, unicode or file objects.
            y : iterable. Label strings.

        Returns:
            features: document id matrix.
            y: label id matrix.
        """
        # re-instantiate TransformerEmbedding/Elmo if necessary since it is excluded from pickling
        if self.te_model is not None: self.activate_transformer(self.te_model, layers=self.te_layers)
        if self.use_elmo: self.activate_elmo()


        features = []

        word_ids = [self._word_vocab.doc2id(doc) for doc in X]
        word_ids = sequence.pad_sequences(word_ids, padding='post')
        features.append(word_ids)

        if self._use_char:
            char_ids = [[self._char_vocab.doc2id(w) for w in doc] for doc in X]
            char_ids = pad_nested_sequences(char_ids)
            features.append(char_ids)

        if self.elmo is not None:
            if not ALLENNLP_INSTALLED:        
                raise Exception(ALLENNLP_ERRMSG)

            character_ids = batch_to_ids(X)
            elmo_embeddings = self.elmo(character_ids)['elmo_representations'][1]
            elmo_embeddings = elmo_embeddings.detach().numpy()
            features.append(elmo_embeddings)

        if self.te is not None:
            transformer_embeddings = self.te.embed(X, word_level=True)
            features.append(transformer_embeddings)


        if y is not None:
            y = [self._label_vocab.doc2id(doc) for doc in y]
            y = sequence.pad_sequences(y, padding='post')
            y = to_categorical(y, self.label_size).astype(int)
            # In 2018/06/01, to_categorical is a bit strange.
            # >>> to_categorical([[1,3]], num_classes=4).shape
            # (1, 2, 4)
            # >>> to_categorical([[1]], num_classes=4).shape
            # (1, 4)
            # So, I expand dimensions when len(y.shape) == 2.
            y = y if len(y.shape) == 3 else np.expand_dims(y, axis=0)
            return features, y
        else:
            return features


    def fit_transform(self, X, y=None, **params):
        """Learn vocabulary and return document id matrix.

        This is equivalent to fit followed by transform.

        Args:
            X : iterable
            an iterable which yields either str, unicode or file objects.

        Returns:
            list : document id matrix.
            list: label id matrix.
        """
        return self.fit(X, y).transform(X, y)

    def inverse_transform(self, y, lengths=None):
        """Return label strings.

        Args:
            y: label id matrix.
            lengths: sentence lengths.

        Returns:
            list: list of list of strings.
        """
        y = np.argmax(y, -1)
        inverse_y = [self._label_vocab.id2doc(ids) for ids in y]
        if lengths is not None:
            inverse_y = [iy[:l] for iy, l in zip(inverse_y, lengths)]

        return inverse_y

    @property
    def word_vocab_size(self):
        return len(self._word_vocab)

    @property
    def char_vocab_size(self):
        return len(self._char_vocab)

    @property
    def label_size(self):
        return len(self._label_vocab)

    def save(self, file_path):
        joblib.dump(self, file_path)

    @classmethod
    def load(cls, file_path):
        p = joblib.load(file_path)

        return p

Ancestors

  • sklearn.base.BaseEstimator
  • sklearn.base.TransformerMixin

Static methods

def load(file_path)
Expand source code
@classmethod
def load(cls, file_path):
    p = joblib.load(file_path)

    return p

Instance variables

var char_vocab_size
Expand source code
@property
def char_vocab_size(self):
    return len(self._char_vocab)
var label_size
Expand source code
@property
def label_size(self):
    return len(self._label_vocab)
var word_vocab_size
Expand source code
@property
def word_vocab_size(self):
    return len(self._word_vocab)

Methods

def activate_elmo(self)
Expand source code
def activate_elmo(self):
    if not ALLENNLP_INSTALLED:
        raise Exception(ALLENNLP_ERRMSG)

    if not hasattr(self, 'elmo'): self.elmo=None
    if self.elmo is None:
        self.elmo = Elmo(options_file, weight_file, 2, dropout=0)
    self.use_elmo = True
def activate_transformer(self, model_name, layers=[-2], force=False)
Expand source code
def activate_transformer(self, model_name, layers=U.DEFAULT_TRANSFORMER_LAYERS, 
                          force=False):
    from ...preprocessor import TransformerEmbedding
    if not hasattr(self, 'te'): self.te = None
    if self.te is None or self.te_model != model_name or force:
        self.te_model = model_name
        self.te = TransformerEmbedding(model_name, layers=layers)
    self.te_layers = layers
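
An illustrative activation (the model name is just an example of a Hugging Face identifier; the first call downloads pretrained weights):

p = IndexTransformer()
p.activate_transformer('bert-base-uncased')   # wraps the model in a TransformerEmbedding
print(p.transformer_is_activated())           # True
print(p.get_transformer_dim())                # embedding dimension reported by the model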
def elmo_is_activated(self)
Expand source code
def elmo_is_activated(self):
    return self.elmo is not None
def fit(self, X, y)

Learn vocabulary from training set.

Args

X : iterable. An iterable which yields either str, unicode or file objects.

Returns

self
IndexTransformer.
Expand source code
def fit(self, X, y):
    """Learn vocabulary from training set.

    Args:
        X : iterable. An iterable which yields either str, unicode or file objects.

    Returns:
        self : IndexTransformer.
    """
    self._word_vocab.add_documents(X)
    self._label_vocab.add_documents(y)
    if self._use_char:
        for doc in X:
            self._char_vocab.add_documents(doc)

    self._word_vocab.build()
    self._char_vocab.build()
    self._label_vocab.build()

    return self
def fit_transform(self, X, y=None, **params)

Learn vocabulary and return document id matrix.

This is equivalent to fit followed by transform.

Args

X : iterable
An iterable which yields either str, unicode or file objects.

Returns

list
document id matrix.
list
label id matrix.
Expand source code
def fit_transform(self, X, y=None, **params):
    """Learn vocabulary and return document id matrix.

    This is equivalent to fit followed by transform.

    Args:
        X : iterable
        an iterable which yields either str, unicode or file objects.

    Returns:
        list : document id matrix.
        list: label id matrix.
    """
    return self.fit(X, y).transform(X, y)
def fix_tokenization(self, X, Y, maxlen=512, num_special=2)

Should be called prior to training.

Expand source code
def fix_tokenization(self, X, Y, maxlen=U.DEFAULT_TRANSFORMER_MAXLEN, num_special=U.DEFAULT_TRANSFORMER_NUM_SPECIAL):
    """
    Should be called prior training
    """
    if not self.transformer_is_activated():
        return X, Y
    ids2tok = self.te.tokenizer.convert_ids_to_tokens
    encode = self.te.tokenizer.encode
    new_X = []
    new_Y = []
    for i, x in enumerate(X):
        new_x = []
        new_y =[]
        seq_len = 0
        for j,s in enumerate(x):
            subtokens = ids2tok(encode(s, add_special_tokens=False))
            token_len = len(subtokens)
            if seq_len + token_len > (maxlen - num_special):
                break
            seq_len += token_len
            hf_s = ' '.join(subtokens).replace(' ##', '').split()
            new_x.extend(hf_s)
            if Y is not None:
                tag = Y[i][j]
                new_y.extend([tag])
                if len(hf_s) > 1:
                    new_tag = tag
                    if tag.startswith('B-'): new_tag = 'I-'+tag[2:]
                    new_y.extend([new_tag]*(len(hf_s)-1) )
                #if tag.startswith('B-'): tag = 'I-'+tag[2:]

        new_X.append(new_x)
        new_Y.append(new_y)
    new_Y = None if Y is None else new_Y
    return new_X, new_Y
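
The tag-propagation rule at the heart of this method can be sketched without loading a transformer: when a word is split into several pieces, the first piece keeps the original tag and the remaining pieces receive the I- variant. The split below is made up for illustration:

word, tag = "O'Brien", 'B-PER'
pieces = ['O', "'", 'Brien']            # pretend the tokenizer split the word at punctuation
new_tags = [tag]
if len(pieces) > 1:
    cont = 'I-' + tag[2:] if tag.startswith('B-') else tag
    new_tags.extend([cont] * (len(pieces) - 1))
print(list(zip(pieces, new_tags)))      # [('O', 'B-PER'), ("'", 'I-PER'), ('Brien', 'I-PER')]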
def get_transformer_dim(self)
Expand source code
def get_transformer_dim(self):
    if not self.transformer_is_activated(): 
        return None
    else:
        return self.te.embsize
def inverse_transform(self, y, lengths=None)

Return label strings.

Args

y
label id matrix.
lengths
sentence lengths.

Returns

list
list of list of strings.
Expand source code
def inverse_transform(self, y, lengths=None):
    """Return label strings.

    Args:
        y: label id matrix.
        lengths: sentence lengths.

    Returns:
        list: list of list of strings.
    """
    y = np.argmax(y, -1)
    inverse_y = [self._label_vocab.id2doc(ids) for ids in y]
    if lengths is not None:
        inverse_y = [iy[:l] for iy, l in zip(inverse_y, lengths)]

    return inverse_y
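
A decoding sketch (assumes p is a fitted IndexTransformer, x_valid is the list of tokenized validation sentences, and y_pred holds per-token class probabilities from the NER model; all three names are placeholders):

lengths = [len(doc) for doc in x_valid]              # true, unpadded sentence lengths
tags = p.inverse_transform(y_pred, lengths=lengths)
# tags[0] might be ['B-LOC', 'O', 'O'] for the first validation sentence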
def save(self, file_path)
Expand source code
def save(self, file_path):
    joblib.dump(self, file_path)
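
Because the Elmo and TransformerEmbedding objects are excluded from pickling (see __getstate__), a saved preprocessor stays small and the embedding models are re-created on demand after loading. A sketch with an illustrative path:

p.save('/tmp/ner_preprocessor.pkl')
p2 = IndexTransformer.load('/tmp/ner_preprocessor.pkl')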
def transform(self, X, y=None)

Transform documents to document ids.

Uses the vocabulary learned by fit.

Args

X : iterable
An iterable which yields either str, unicode or file objects.
y : iterable
Label strings.

Returns

features
document id matrix.
y
label id matrix.
Expand source code
def transform(self, X, y=None):
    """Transform documents to document ids.

    Uses the vocabulary learned by fit.

    Args:
        X : iterable
        an iterable which yields either str, unicode or file objects.
        y : iterable. Label strings.

    Returns:
        features: document id matrix.
        y: label id matrix.
    """
    # re-instantiate TransformerEmbedding/Elmo if necessary since it is excluded from pickling
    if self.te_model is not None: self.activate_transformer(self.te_model, layers=self.te_layers)
    if self.use_elmo: self.activate_elmo()


    features = []

    word_ids = [self._word_vocab.doc2id(doc) for doc in X]
    word_ids = sequence.pad_sequences(word_ids, padding='post')
    features.append(word_ids)

    if self._use_char:
        char_ids = [[self._char_vocab.doc2id(w) for w in doc] for doc in X]
        char_ids = pad_nested_sequences(char_ids)
        features.append(char_ids)

    if self.elmo is not None:
        if not ALLENNLP_INSTALLED:        
            raise Exception(ALLENNLP_ERRMSG)

        character_ids = batch_to_ids(X)
        elmo_embeddings = self.elmo(character_ids)['elmo_representations'][1]
        elmo_embeddings = elmo_embeddings.detach().numpy()
        features.append(elmo_embeddings)

    if self.te is not None:
        transformer_embeddings = self.te.embed(X, word_level=True)
        features.append(transformer_embeddings)


    if y is not None:
        y = [self._label_vocab.doc2id(doc) for doc in y]
        y = sequence.pad_sequences(y, padding='post')
        y = to_categorical(y, self.label_size).astype(int)
        # In 2018/06/01, to_categorical is a bit strange.
        # >>> to_categorical([[1,3]], num_classes=4).shape
        # (1, 2, 4)
        # >>> to_categorical([[1]], num_classes=4).shape
        # (1, 4)
        # So, I expand dimensions when len(y.shape) == 2.
        y = y if len(y.shape) == 3 else np.expand_dims(y, axis=0)
        return features, y
    else:
        return features
def transformer_is_activated(self)
Expand source code
def transformer_is_activated(self):
    return self.te is not None