Module ktrain.text.shallownlp.ner
Expand source code
from .imports import *
class NER:
    def __init__(self, lang='en', predictor_path=None):
        """
        ```
        Pretrained named-entity recognizer.
        English, Chinese, and Russian are currently supported.
        Args:
            lang(str): Currently, one of {'en', 'zh', 'ru'}: en=English, zh=Chinese, or ru=Russian
            predictor_path(str): path to a custom ktrain predictor; when supplied, it
                                 overrides the lang-based bundled model selection
        Raises:
            ValueError: if lang is None, if lang is unsupported and no predictor_path
                        is given, if predictor_path is invalid, or if ktrain is not installed
        ```
        """
        if lang is None:
            raise ValueError('lang is required (e.g., "en" for English, "zh" for Chinese, "ru" for Russian, etc.')
        # Single source of truth for supported languages and their bundled model folders
        # (replaces the previous repeated if/elif chains).
        lang2model = {'en': 'ner_models/ner_english',
                      'zh': 'ner_models/ner_chinese',
                      'ru': 'ner_models/ner_russian'}
        if predictor_path is None and lang not in lang2model:
            # fixed message: 'ru' is Russian (previously mislabeled as Chinese)
            raise ValueError("Unsupported language: if predictor_path is None, then lang must be " +
                             "'en' for English, 'zh' for Chinese, or 'ru' for Russian")
        self.lang = lang
        if os.environ.get('DISABLE_V2_BEHAVIOR', None) != '1':
            # The underlying model uses the CRF layer from keras_contrib, which needs
            # TF1 behavior; bail out early with instructions instead of failing later.
            warnings.warn("Please add os.environ['DISABLE_V2_BEHAVIOR'] = '1' at top of your script or notebook")
            msg = "\nNER in ktrain uses the CRF module from keras_contrib, which is not yet\n" +\
                  "fully compatible with TensorFlow 2. To use NER, you must add the following to the top of your\n" +\
                  "script or notebook BEFORE you import ktrain (after restarting runtime):\n\n" +\
                  "import os\n" +\
                  "os.environ['DISABLE_V2_BEHAVIOR'] = '1'\n"
            print(msg)
            return
        else:
            import tensorflow.compat.v1 as tf
            tf.disable_v2_behavior()
        if predictor_path is None:
            # use the model bundled alongside this module for the requested language
            dirpath = os.path.dirname(os.path.abspath(__file__))
            fpath = os.path.join(dirpath, lang2model[self.lang])
        else:
            # a ktrain predictor consists of a model file plus a '.preproc' companion file
            if not os.path.isfile(predictor_path) or not os.path.isfile(predictor_path + '.preproc'):
                raise ValueError('could not find a valid predictor model ' +
                                 '%s or valid Preprocessor %s at specified path' % (predictor_path, predictor_path + '.preproc'))
            fpath = predictor_path
        try:
            import io
            from contextlib import redirect_stdout
            f = io.StringIO()
            with redirect_stdout(f):  # suppress ktrain's import-time banner output
                import ktrain
        except ImportError:
            # narrowed from a bare 'except:' which also swallowed KeyboardInterrupt etc.
            raise ValueError('ktrain could not be imported. Install with: pip install ktrain')
        self.predictor = ktrain.load_predictor(fpath)

    def predict(self, texts, merge_tokens=True, batch_size=32):
        """
        ```
        Extract named entities from supplied text
        Args:
            texts (list of str or str): list of texts to annotate
            merge_tokens(bool): If True, tokens will be merged together by the entity
                                to which they are associated:
                                ('Paul', 'B-PER'), ('Newman', 'I-PER') becomes ('Paul Newman', 'PER')
            batch_size(int): Batch size to use for predictions (default:32)
        Returns:
            list of (token, tag) results; a single document's results are unwrapped
            from the outer list for convenience
        ```
        """
        if isinstance(texts, str):
            texts = [texts]
        self.predictor.batch_size = batch_size
        texts = [t.strip() for t in texts]
        results = self.predictor.predict(texts, merge_tokens=merge_tokens)
        # unwrap single-document results so callers get the annotations directly
        if len(results) == 1:
            results = results[0]
        return results

    # 2020-04-30: merge_tokens was moved to text.ner.predictor
Classes
class NER (lang='en', predictor_path=None)
-
pretrained NER. English, Chinese, and Russian are currently supported. Args: lang(str): Currently, one of {'en', 'zh', 'ru'}: en=English, zh=Chinese, or ru=Russian
Expand source code
class NER: def __init__(self, lang='en', predictor_path=None): """ ``` pretrained NER. Only English and Chinese are currenty supported. Args: lang(str): Currently, one of {'en', 'zh', 'ru'}: en=English , zh=Chinese, or ru=Russian ``` """ if lang is None: raise ValueError('lang is required (e.g., "en" for English, "zh" for Chinese, "ru" for Russian, etc.') if predictor_path is None and lang not in ['en', 'zh', 'ru']: raise ValueError("Unsupported language: if predictor_path is None, then lang must be " +\ "'en' for English, 'zh' for Chinese, or 'ru' for Chinese") self.lang = lang if os.environ.get('DISABLE_V2_BEHAVIOR', None) != '1': warnings.warn("Please add os.environ['DISABLE_V2_BEHAVIOR'] = '1' at top of your script or notebook") msg = "\nNER in ktrain uses the CRF module from keras_contrib, which is not yet\n" +\ "fully compatible with TensorFlow 2. To use NER, you must add the following to the top of your\n" +\ "script or notebook BEFORE you import ktrain (after restarting runtime):\n\n" +\ "import os\n" +\ "os.environ['DISABLE_V2_BEHAVIOR'] = '1'\n" print(msg) return else: import tensorflow.compat.v1 as tf tf.disable_v2_behavior() if predictor_path is None and self.lang == 'zh': dirpath = os.path.dirname(os.path.abspath(__file__)) fpath = os.path.join(dirpath, 'ner_models/ner_chinese') elif predictor_path is None and self.lang == 'ru': dirpath = os.path.dirname(os.path.abspath(__file__)) fpath = os.path.join(dirpath, 'ner_models/ner_russian') elif predictor_path is None and self.lang=='en': dirpath = os.path.dirname(os.path.abspath(__file__)) fpath = os.path.join(dirpath, 'ner_models/ner_english') elif predictor_path is None: raise ValueError("Unsupported language: if predictor_path is None, then lang must be " +\ "'en' for English, 'zh' for Chinese, or 'ru' for Chinese") else: if not os.path.isfile(predictor_path) or not os.path.isfile(predictor_path +'.preproc'): raise ValueError('could not find a valid predictor model '+\ '%s or valid Preprocessor %s at 
specified path' % (predictor_path, predictor_path+'.preproc')) fpath = predictor_path try: import io from contextlib import redirect_stdout f = io.StringIO() with redirect_stdout(f): import ktrain except: raise ValueError('ktrain could not be imported. Install with: pip install ktrain') self.predictor = ktrain.load_predictor(fpath) def predict(self, texts, merge_tokens=True, batch_size=32): """ ``` Extract named entities from supplied text Args: texts (list of str or str): list of texts to annotate merge_tokens(bool): If True, tokens will be merged together by the entity to which they are associated: ('Paul', 'B-PER'), ('Newman', 'I-PER') becomes ('Paul Newman', 'PER') batch_size(int): Batch size to use for predictions (default:32) ``` """ if isinstance(texts, str): texts = [texts] self.predictor.batch_size = batch_size texts = [t.strip() for t in texts] results = self.predictor.predict(texts, merge_tokens=merge_tokens) if len(results) == 1: results = results[0] return results
Methods
def predict(self, texts, merge_tokens=True, batch_size=32)
-
Extract named entities from supplied text Args: texts (list of str or str): list of texts to annotate merge_tokens(bool): If True, tokens will be merged together by the entity to which they are associated: ('Paul', 'B-PER'), ('Newman', 'I-PER') becomes ('Paul Newman', 'PER') batch_size(int): Batch size to use for predictions (default:32)
Expand source code
def predict(self, texts, merge_tokens=True, batch_size=32): """ ``` Extract named entities from supplied text Args: texts (list of str or str): list of texts to annotate merge_tokens(bool): If True, tokens will be merged together by the entity to which they are associated: ('Paul', 'B-PER'), ('Newman', 'I-PER') becomes ('Paul Newman', 'PER') batch_size(int): Batch size to use for predictions (default:32) ``` """ if isinstance(texts, str): texts = [texts] self.predictor.batch_size = batch_size texts = [t.strip() for t in texts] results = self.predictor.predict(texts, merge_tokens=merge_tokens) if len(results) == 1: results = results[0] return results