Module ktrain.text.ner.predictor
Expand source code
from ...imports import *
from ...predictor import Predictor
from .preprocessor import NERPreprocessor
from ... import utils as U
from .. import textutils as TU
class NERPredictor(Predictor):
    """
    Predicts named-entity tags for the string representation of a sentence
    (or a list of sentences), returning a (token, tag) pair per token.
    """

    def __init__(self, model, preproc, batch_size=U.DEFAULT_BS):
        """
        Args:
            model (Model): trained Keras sequence-tagging model
            preproc (NERPreprocessor): preprocessor used during training
            batch_size (int): number of sentences fed to the model per batch
        Raises:
            ValueError: if model or preproc has the wrong type
        """
        if not isinstance(model, Model):
            raise ValueError('model must be of instance Model')
        if not isinstance(preproc, NERPreprocessor):
            raise ValueError('preproc must be a NERPreprocessor object')
        self.model = model
        self.preproc = preproc
        self.c = self.preproc.get_classes()
        self.batch_size = batch_size

    def get_classes(self):
        """Return the list of class (tag) labels known to the preprocessor."""
        return self.c

    def predict(self, sentences, return_proba=False, merge_tokens=False, custom_tokenizer=None):
        """
        Makes predictions for a string-representation of a sentence

        Args:
            sentences(list|str): either a single sentence as a string or a list of sentences
            return_proba(bool): If True, also return the probability of the predicted
                                tag for each token. Mutually exclusive with merge_tokens.
            merge_tokens(bool): If True, tokens will be merged together by the entity
                                to which they are associated:
                                ('Paul', 'B-PER'), ('Newman', 'I-PER') becomes ('Paul Newman', 'PER')
            custom_tokenizer(Callable): If specified, sentences are tokenized with this
                                        callable instead of the default tokenizer.
        Returns:
            list: If sentences is a string representation of a single sentence:
                      a list containing a tuple for each token in the sentence.
                  If sentences is a list of sentences:
                      a list of lists, one inner list (as above) per sentence.
        Raises:
            ValueError: if sentences has the wrong type, or if return_proba and
                        merge_tokens are both True
        """
        is_array = not isinstance(sentences, str)
        if not isinstance(sentences, (str, list)):
            raise ValueError('Param sentence must be either string-representation of a sentence or a list of sentence strings.')
        if return_proba and merge_tokens:
            raise ValueError('return_proba and merge_tokens are mutually exclusive with one another.')
        if isinstance(sentences, str):
            sentences = [sentences]
        lang = TU.detect_lang(sentences)

        # batchify: split sentences into roughly batch_size-sized chunks
        num_chunks = math.ceil(len(sentences) / self.batch_size)
        batches = U.list2chunks(sentences, n=num_chunks)

        # process batches
        results = []
        for batch in batches:
            nerseq = self.preproc.preprocess(batch, lang=lang, custom_tokenizer=custom_tokenizer)
            if not nerseq.prepare_called:
                nerseq.prepare()
            nerseq.batch_size = len(batch)
            x_true, _ = nerseq[0]
            lengths = nerseq.get_lengths(0)
            y_pred = self.model.predict_on_batch(x_true)
            y_labels = self.preproc.p.inverse_transform(y_pred, lengths)
            if return_proba:
                # y_pred is normally an array of per-class scores; some model types
                # apparently return a tuple/list instead (see TODO below).
                # FIX: narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
                # are no longer swallowed.
                try:
                    probs = np.max(y_pred, axis=2)
                except Exception:
                    probs = y_pred[0].numpy().tolist()  # TODO: remove after confirmation (#316)
                for x, y, prob in zip(nerseq.x, y_labels, probs):
                    result = [(x[i], y[i], prob[i]) for i in range(len(x))]
                    results.append(result)
            else:
                for x, y in zip(nerseq.x, y_labels):
                    result = list(zip(x, y))
                    if merge_tokens:
                        result = self.merge_tokens(result, lang)
                    results.append(result)
        if not is_array:
            results = results[0]
        return results

    def merge_tokens(self, annotated_sentence, lang):
        """
        Merge a BIO-annotated token list into (entity_text, tag) pairs.

        Args:
            annotated_sentence(list): list of (token, label) tuples where label
                is a BIO-style tag such as 'B-PER', 'I-PER', or 'O'
            lang(str): language code; Chinese tokens are joined without a separator
        Returns:
            list: list of (merged_token_string, tag) tuples
        """
        # strict=False: workaround for langdetect bug on short chinese texts
        sep = '' if TU.is_chinese(lang, strict=False) else ' '

        current_token = ""
        current_tag = ""
        entities = []
        for tup in annotated_sentence:
            token = tup[0]
            entity = tup[1]
            tag = entity.split('-')[1] if '-' in entity else None
            prefix = entity.split('-')[0] if '-' in entity else None
            if tag is None and not current_token:
                # outside any entity
                continue
            elif tag and (prefix == 'B' or (prefix == 'I' and not current_token)):
                # beginning of an entity (a dangling I- tag also starts one)
                if current_token:  # consecutive entities: flush the previous one
                    entities.append((current_token, current_tag))
                current_token = token
                current_tag = tag
            elif tag is None and current_token:
                # end of an entity
                entities.append((current_token, current_tag))
                current_token = ""
                current_tag = None
                continue
            elif tag and current_token:
                # within an entity (prefix 'I')
                current_token = current_token + sep + token
                current_tag = tag
        if current_token and current_tag:
            entities.append((current_token, current_tag))
        return entities
Classes
class NERPredictor (model, preproc, batch_size=32)
-
predicts classes for string-representation of sentence
Expand source code
class NERPredictor(Predictor): """ predicts classes for string-representation of sentence """ def __init__(self, model, preproc, batch_size=U.DEFAULT_BS): if not isinstance(model, Model): raise ValueError('model must be of instance Model') if not isinstance(preproc, NERPreprocessor): #if type(preproc).__name__ != 'NERPreprocessor': raise ValueError('preproc must be a NERPreprocessor object') self.model = model self.preproc = preproc self.c = self.preproc.get_classes() self.batch_size = batch_size def get_classes(self): return self.c def predict(self, sentences, return_proba=False, merge_tokens=False, custom_tokenizer=None): """ Makes predictions for a string-representation of a sentence Args: sentences(list|str): either a single sentence as a string or a list of sentences return_proba(bool): If return_proba is True, returns probability distribution for each token merge_tokens(bool): If True, tokens will be merged together by the entity to which they are associated: ('Paul', 'B-PER'), ('Newman', 'I-PER') becomes ('Paul Newman', 'PER') custom_tokenizer(Callable): If specified, sentence will be tokenized based on custom tokenizer Returns: list: If sentences is a string representation of single sentence: list containing a tuple for each token in sentence IF sentences is a list of sentences: list of lists: Each inner list represents a sentence and contains a tuple for each token in sentence """ is_array = not isinstance(sentences, str) if not isinstance(sentences, (str, list)): raise ValueError('Param sentence must be either string-representation of a sentence or a list of sentence strings.') if return_proba and merge_tokens: raise ValueError('return_proba and merge_tokens are mutually exclusive with one another.') if isinstance(sentences, str): sentences = [sentences] lang = TU.detect_lang(sentences) # batchify num_chunks = math.ceil(len(sentences)/self.batch_size) batches = U.list2chunks(sentences, n=num_chunks) # process batches results = [] for batch in batches: 
nerseq = self.preproc.preprocess(batch, lang=lang, custom_tokenizer=custom_tokenizer) if not nerseq.prepare_called: nerseq.prepare() nerseq.batch_size = len(batch) x_true, _ = nerseq[0] lengths = nerseq.get_lengths(0) y_pred = self.model.predict_on_batch(x_true) y_labels = self.preproc.p.inverse_transform(y_pred, lengths) if return_proba: try: probs = np.max(y_pred, axis=2) except: probs = y_pred[0].numpy().tolist() # TODO: remove after confirmation (#316) for x, y, prob in zip(nerseq.x, y_labels, probs): result = [(x[i], y[i], prob[i]) for i in range(len(x))] results.append(result) else: for x,y in zip(nerseq.x, y_labels): result = list(zip(x,y)) if merge_tokens: result = self.merge_tokens(result, lang) results.append(result) if not is_array: results = results[0] return results def merge_tokens(self, annotated_sentence, lang): if TU.is_chinese(lang, strict=False): # strict=False: workaround for langdetect bug on short chinese texts sep = '' else: sep = ' ' current_token = "" current_tag = "" entities = [] for tup in annotated_sentence: token = tup[0] entity = tup[1] tag = entity.split('-')[1] if '-' in entity else None prefix = entity.split('-')[0] if '-' in entity else None # not within entity if tag is None and not current_token: continue # beginning of entity #elif tag and prefix=='B': elif tag and (prefix=='B' or prefix=='I' and not current_token): if current_token: # consecutive entities entities.append((current_token, current_tag)) current_token = "" current_tag = None current_token = token current_tag = tag # end of entity elif tag is None and current_token: entities.append((current_token, current_tag)) current_token = "" current_tag = None continue # within entity elif tag and current_token: # prefix I current_token = current_token + sep + token current_tag = tag if current_token and current_tag: entities.append((current_token, current_tag)) return entities
Ancestors
- Predictor
- abc.ABC
Methods
def get_classes(self)
-
Expand source code
def get_classes(self): return self.c
def merge_tokens(self, annotated_sentence, lang)
-
Expand source code
def merge_tokens(self, annotated_sentence, lang): if TU.is_chinese(lang, strict=False): # strict=False: workaround for langdetect bug on short chinese texts sep = '' else: sep = ' ' current_token = "" current_tag = "" entities = [] for tup in annotated_sentence: token = tup[0] entity = tup[1] tag = entity.split('-')[1] if '-' in entity else None prefix = entity.split('-')[0] if '-' in entity else None # not within entity if tag is None and not current_token: continue # beginning of entity #elif tag and prefix=='B': elif tag and (prefix=='B' or prefix=='I' and not current_token): if current_token: # consecutive entities entities.append((current_token, current_tag)) current_token = "" current_tag = None current_token = token current_tag = tag # end of entity elif tag is None and current_token: entities.append((current_token, current_tag)) current_token = "" current_tag = None continue # within entity elif tag and current_token: # prefix I current_token = current_token + sep + token current_tag = tag if current_token and current_tag: entities.append((current_token, current_tag)) return entities
def predict(self, sentences, return_proba=False, merge_tokens=False, custom_tokenizer=None)
-
Makes predictions for a string-representation of a sentence
Args
sentences(list|str): either a single sentence as a string or a list of sentences return_proba(bool): If return_proba is True, returns probability distribution for each token merge_tokens(bool): If True, tokens will be merged together by the entity to which they are associated: ('Paul', 'B-PER'), ('Newman', 'I-PER') becomes ('Paul Newman', 'PER') custom_tokenizer(Callable): If specified, sentence will be tokenized based on custom tokenizer
Returns
list
- If sentences is a string representation of a single sentence: list containing a tuple for each token in the sentence. If sentences is a list of sentences: list of lists: each inner list represents a sentence and contains a tuple for each token in the sentence
Expand source code
def predict(self, sentences, return_proba=False, merge_tokens=False, custom_tokenizer=None): """ Makes predictions for a string-representation of a sentence Args: sentences(list|str): either a single sentence as a string or a list of sentences return_proba(bool): If return_proba is True, returns probability distribution for each token merge_tokens(bool): If True, tokens will be merged together by the entity to which they are associated: ('Paul', 'B-PER'), ('Newman', 'I-PER') becomes ('Paul Newman', 'PER') custom_tokenizer(Callable): If specified, sentence will be tokenized based on custom tokenizer Returns: list: If sentences is a string representation of single sentence: list containing a tuple for each token in sentence IF sentences is a list of sentences: list of lists: Each inner list represents a sentence and contains a tuple for each token in sentence """ is_array = not isinstance(sentences, str) if not isinstance(sentences, (str, list)): raise ValueError('Param sentence must be either string-representation of a sentence or a list of sentence strings.') if return_proba and merge_tokens: raise ValueError('return_proba and merge_tokens are mutually exclusive with one another.') if isinstance(sentences, str): sentences = [sentences] lang = TU.detect_lang(sentences) # batchify num_chunks = math.ceil(len(sentences)/self.batch_size) batches = U.list2chunks(sentences, n=num_chunks) # process batches results = [] for batch in batches: nerseq = self.preproc.preprocess(batch, lang=lang, custom_tokenizer=custom_tokenizer) if not nerseq.prepare_called: nerseq.prepare() nerseq.batch_size = len(batch) x_true, _ = nerseq[0] lengths = nerseq.get_lengths(0) y_pred = self.model.predict_on_batch(x_true) y_labels = self.preproc.p.inverse_transform(y_pred, lengths) if return_proba: try: probs = np.max(y_pred, axis=2) except: probs = y_pred[0].numpy().tolist() # TODO: remove after confirmation (#316) for x, y, prob in zip(nerseq.x, y_labels, probs): result = [(x[i], y[i], prob[i]) 
for i in range(len(x))] results.append(result) else: for x,y in zip(nerseq.x, y_labels): result = list(zip(x,y)) if merge_tokens: result = self.merge_tokens(result, lang) results.append(result) if not is_array: results = results[0] return results
Inherited members