Source code for pororo.tasks.named_entity_recognition

"""Named Entity Recognition related modeling class"""

from typing import List, Optional, Tuple

from pororo.tasks.utils.base import PororoFactoryBase, PororoSimpleBase


class PororoNerFactory(PororoFactoryBase):
    """
    Conduct named entity recognition

    English (`roberta.base.en.ner`)

        - dataset: OntoNotes 5.0
        - metric: F1 (91.63)

    Korean (`charbert.base.ko.ner`)

        - dataset: 개체명 분석 말뭉치 (named entity analysis corpus, https://corpus.korean.go.kr/)
        - metric: F1 (89.63)

    Japanese (`jaberta.base.ja.ner`)

        - dataset: Kyoto University Web Document Leads Corpus
        - metric: F1 (76.74)
        - ref: https://github.com/ku-nlp/KWDLC

    Chinese (`zhberta.base.zh.ner`)

        - dataset: OntoNotes 5.0
        - metric: F1 (79.06)

    Args:
        sent: (str) sentence to be sequence labeled

    Returns:
        List[Tuple[str, str]]: token and its predicted tag tuple list

    Examples:
        >>> ner = Pororo(task="ner")
        >>> ner("It was in midfield where Arsenal took control of the game, and that was mainly down to Thomas Partey and Mohamed Elneny.")
        [('It', 'O'), ('was', 'O'), ('in', 'O'), ('midfield', 'O'), ('where', 'O'), ('Arsenal', 'ORG'), ('took', 'O'), ('control', 'O'), ('of', 'O'), ('the', 'O'), ('game', 'O'), (',', 'O'), ('and', 'O'), ('that', 'O'), ('was', 'O'), ('mainly', 'O'), ('down', 'O'), ('to', 'O'), ('Thomas Partey', 'PERSON'), ('and', 'O'), ('Mohamed Elneny', 'PERSON'), ('.', 'O')]
        >>> ner = Pororo(task="ner", lang="ko")
        >>> ner("안녕하세요. 제 이름은 카터입니다.")
        [("안녕하세요.", "O"), (" ", "O"), ("제", "O"), ("이름은", "O"), ("카터", "PS"), ("입니다.", "O")]
        >>> ner = Pororo(task="ner", lang="zh")
        >>> ner("毛泽东(1893年12月26日-1976年9月9日),字润之,湖南湘潭人。中华民国大陆时期、中国共产党和中华人民共和国的重要政治家、经济家、军事家、战略家、外交家和诗人。")
        [('毛泽东', 'PERSON'), ('(', 'O'), ('1893年12月26日-1976年9月9日', 'DATE'), (')', 'O'), (',', 'O'), ('字润之', 'O'), (',', 'O'), ('湖南', 'GPE'), ('湘潭', 'GPE'), ('人', 'O'), ('。', 'O'), ('中华民国大陆时期', 'GPE'), ('、', 'O'), ('中国共产党', 'ORG'), ('和', 'O'), ('中华人民共和国', 'GPE'), ('的', 'O'), ('重', 'O'), ('要', 'O'), ('政', 'O'), ('治', 'O'), ('家', 'O'), ('、', 'O'), ('经', 'O'), ('济', 'O'), ('家', 'O'), ('、', 'O'), ('军', 'O'), ('事', 'O'), ('家', 'O'), ('、', 'O'), ('战', 'O'), ('略', 'O'), ('家', 'O'), ('、', 'O'), ('外', 'O'), ('交', 'O'), ('家', 'O'), ('和', 'O'), ('诗', 'O'), ('人', 'O'), ('。', 'O')]
        >>> ner = Pororo(task="ner", lang="ja")
        >>> ner("豊臣 秀吉、または羽柴 秀吉は、戦国時代から安土桃山時代にかけての武将、大名。天下人、武家関白、太閤。三英傑の一人。")
        [('豊臣秀吉', 'PERSON'), ('、', 'O'), ('または', 'O'), ('羽柴秀吉', 'PERSON'), ('は', 'O'), ('、', 'O'), ('戦国時代', 'DATE'), ('から', 'O'), ('安土桃山時代', 'DATE'), ('にかけて', 'O'), ('の', 'O'), ('武将', 'O'), ('、', 'O'), ('大名', 'O'), ('。', 'O'), ('天下', 'O'), ('人', 'O'), ('、', 'O'), ('武家', 'O'), ('関白', 'O'), ('、太閤', 'O'), ('。', 'O'), ('三', 'O'), ('英', 'O'), ('傑', 'O'), ('の', 'O'), ('一', 'O'), ('人', 'O'), ('。', 'O')]

    """

    def __init__(self, task: str, lang: str, model: Optional[str]):
        super().__init__(task, lang, model)
    @staticmethod
    def get_available_langs():
        return ["en", "ko", "zh", "ja"]
    @staticmethod
    def get_available_models():
        return {
            "en": ["roberta.base.en.ner"],
            "ko": ["charbert.base.ko.ner"],
            "zh": ["zhberta.base.zh.ner"],
            "ja": ["jaberta.base.ja.ner"],
        }
    def load(self, device):
        """
        Load user-selected task-specific model

        Args:
            device (str): device information

        Returns:
            object: User-selected task-specific model

        """
        if "roberta" in self.config.n_model:
            from pororo.models.brainbert import CustomRobertaModel

            model = (CustomRobertaModel.load_model(
                f"bert/{self.config.n_model}",
                self.config.lang,
            ).eval().to(device))
            return PororoBertNerEn(model, self.config)

        if "charbert" in self.config.n_model:
            from pororo.models.brainbert import CharBrainRobertaModel
            from pororo.tasks.tokenization import PororoTokenizationFactory

            model = (CharBrainRobertaModel.load_model(
                f"bert/{self.config.n_model}",
                self.config.lang,
            ).eval().to(device))
            sent_tokenizer = PororoTokenizationFactory(
                task="tokenization",
                model="sent_ko",
                lang=self.config.lang,
            ).load(device)
            return PororoBertCharNer(
                model,
                sent_tokenizer,
                device,
                self.config,
            )

        if "zhberta" in self.config.n_model:
            from pororo.models.brainbert import ZhbertaModel

            model = (ZhbertaModel.load_model(
                f"bert/{self.config.n_model}",
                self.config.lang,
            ).eval().to(device))
            return PororoBertNerZh(model, self.config)

        if "jaberta" in self.config.n_model:
            from pororo.models.brainbert import JabertaModel

            model = (JabertaModel.load_model(
                f"bert/{self.config.n_model}",
                self.config.lang,
            ).eval().to(device))
            return PororoBertNerJa(model, self.config)
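# Usage sketch (not part of the original module). The factory above is normally
# reached through the `Pororo` wrapper shown in the class docstring, but it can
# also be driven directly. The "cpu" device string below is an assumption; any
# torch device identifier accepted by `.to()` should work, and loading fetches
# the model weights on first use.
#
#     >>> PororoNerFactory.get_available_models()["ko"]
#     ['charbert.base.ko.ner']
#     >>> ko_ner = PororoNerFactory(task="ner", lang="ko", model="charbert.base.ko.ner").load("cpu")
#     >>> ko_ner("제 이름은 카터입니다.", ignore_labels=["O"])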
class PororoBertNerEn(PororoSimpleBase):

    def __init__(self, model, config):
        super().__init__(config)
        self._model = model

    def _postprocess(self, tags: List[Tuple[str, str]]):
        """
        Postprocess NER tags to concatenate tokens under the BIO scheme

        Args:
            tags (List[Tuple[str, str]]): predicted (token, tag) pair list

        Returns:
            List[Tuple[str, str]]: postprocessed (token, tag) pair list

        """

        def _remove_tail(tag):
            if "-" in tag:
                tag = tag[2:]
            return tag

        result = list()

        word = tags[0][0]
        tag = tags[0][1]
        for pair in tags[1:]:
            token, label = pair
            if "I" in label:
                word += token
            else:
                word = word.strip()
                if word.endswith("."):
                    result.append((word[:-1], _remove_tail(tag)))
                    result.append((".", "O"))
                else:
                    result.append((word, _remove_tail(tag)))
                word = token
                tag = label

        word = word.strip()
        if word.endswith("."):
            result.append((word[:-1], _remove_tail(tag)))
            result.append((".", "O"))
        else:
            result.append((word, _remove_tail(tag)))

        return [pair for pair in result if pair[0] != ""]
    def predict(self, sent: str):
        """
        Conduct named entity recognition with English RoBERTa

        Args:
            sent: (str) sentence to be sequence labeled

        Returns:
            List[Tuple[str, str]]: token and its predicted tag tuple list

        """
        return self._postprocess(self._model.predict_tags(sent))
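# Illustrative sketch of the BIO merging above. The exact (token, tag) pairs
# returned by `predict_tags` are an assumption here; `_postprocess` itself does
# not touch `self`, so it can be exercised directly on a hand-made tag list:
#
#     >>> PororoBertNerEn._postprocess(
#     ...     None,  # `self` is unused by this helper
#     ...     [("Thomas", "B-PERSON"), (" Partey", "I-PERSON"), (" scored", "O"), (".", "O")],
#     ... )
#     [('Thomas Partey', 'PERSON'), ('scored', 'O'), ('.', 'O')]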
class PororoBertCharNer(PororoSimpleBase):

    def __init__(
        self,
        model,
        sent_tokenizer,
        device,
        config,
    ):
        super().__init__(config)
        self._model = model
        self._sent_tokenizer = sent_tokenizer

        self._tag = {
            "PS": "PERSON",
            "LC": "LOCATION",
            "OG": "ORGANIZATION",
            "AF": "ARTIFACT",
            "DT": "DATE",
            "TI": "TIME",
            "CV": "CIVILIZATION",
            "AM": "ANIMAL",
            "PT": "PLANT",
            "QT": "QUANTITY",
            "FD": "STUDY_FIELD",
            "TR": "THEORY",
            "EV": "EVENT",
            "MT": "MATERIAL",
            "TM": "TERM",
        }

        self._device = device

    def _postprocess(self, tags: List[Tuple[str, str]]):
        """
        Postprocess character-level tags to concatenate tokens under the BIO scheme

        Args:
            tags (List[Tuple[str, str]]): character token and its corresponding tag tuple list

        Returns:
            List[Tuple[str, str]]: postprocessed entity token and its corresponding tag tuple list

        """

        def _remove_tail(tag: str):
            if "-" in tag:
                tag = tag[:-2]
            return tag

        result = list()

        tmp_word = tags[0][0]
        prev_ori_tag = tags[0][1]
        prev_tag = _remove_tail(prev_ori_tag)
        for pair in tags[1:]:
            char = pair[0]
            ori_tag = pair[1]
            tag = _remove_tail(ori_tag)

            if ("▁" in char) and ("-I" not in ori_tag):
                result.append((tmp_word, prev_tag))
                result.append((" ", "O"))
                tmp_word = char
                prev_ori_tag = ori_tag
                prev_tag = tag
                continue

            if (tag == prev_tag) and (("-I" in ori_tag) or ("O" in ori_tag)):
                tmp_word += char
            elif (tag != prev_tag) and ("-I" in ori_tag) and (tag != "O"):
                tmp_word += char
            else:
                result.append((tmp_word, prev_tag))
                tmp_word = char

            prev_ori_tag = ori_tag
            prev_tag = tag

        result.append((tmp_word, prev_tag))

        result = [(pair[0].replace("▁", " ").strip(),
                   pair[1]) if pair[0] != " " else (" ", "O")
                  for pair in result]
        return result
    def predict(
        self,
        text: str,
        ignore_labels: List[str] = [],
    ):
        """
        Conduct named entity recognition with character BERT

        Args:
            text: (str) sentence to be sequence labeled
            ignore_labels: (List[str]) tag labels to drop from the result

        Returns:
            List[Tuple[str, str]]: token and its predicted tag tuple list

        """
        texts = text.strip().split("\n")

        result = []
        for text in texts:
            for sent in self._sent_tokenizer(text.strip()):
                res = self._model.predict_tags(sent)
                res = [
                    pair for pair in self._postprocess(res)
                    if pair[1] not in ignore_labels
                ]
                res = [(
                    pair[0],
                    self._tag[pair[1]],
                ) if pair[1] in self._tag else pair for pair in res]
                result.extend(res)
                result.extend([(" ", "O")])
        return result[:-1]
    def __call__(
        self,
        text: str,
        ignore_labels: List[str] = [],
    ):
        return self.predict(text, ignore_labels)
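# Illustrative sketch of the character-level merging above. The "PS-B"/"PS-I"
# tag strings and the "▁" word-boundary marker are assumptions inferred from
# `_remove_tail` and the boundary check; `_postprocess` does not use `self`, so
# it can be exercised directly:
#
#     >>> PororoBertCharNer._postprocess(
#     ...     None,  # `self` is unused by this helper
#     ...     [("▁카", "PS-B"), ("터", "PS-I"), ("입", "O"), ("니", "O"), ("다", "O"), (".", "O")],
#     ... )
#     [('카터', 'PS'), ('입니다.', 'O')]
#
# `predict` then drops anything listed in `ignore_labels` and maps the short
# tags to the long names in `self._tag` (e.g. "PS" to "PERSON").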
class PororoBertNerZh(PororoSimpleBase):

    def __init__(self, model, config):
        super().__init__(config)
        self._model = model

    def _postprocess(self, tags: List[Tuple[str, str]]):
        """
        Postprocess NER tags to concatenate tokens under the BIO scheme

        Args:
            tags (List[Tuple[str, str]]): predicted (token, tag) pair list

        Returns:
            List[Tuple[str, str]]: postprocessed (token, tag) pair list

        """

        def _remove_tail(tag):
            if "-" in tag:
                tag = tag[2:]
            return tag

        result = list()

        word = tags[0][0]
        tag = tags[0][1]
        for pair in tags[1:]:
            token, label = pair
            if "I" in label:
                word += token
            else:
                word = word.strip()
                result.append((word, _remove_tail(tag)))
                word = token
                tag = label

        word = word.strip()
        result.append((word, _remove_tail(tag)))
        return result
    def predict(self, sent: str):
        """
        Conduct named entity recognition with Chinese RoBERTa

        Args:
            sent: (str) sentence to be sequence labeled

        Returns:
            List[Tuple[str, str]]: token and its predicted tag tuple list

        """
        tags = self._model.predict_tags(sent)
        return self._postprocess(tags)
class PororoBertNerJa(PororoSimpleBase):

    def __init__(self, model, config):
        super().__init__(config)
        self._model = model

    def _postprocess(self, tags: List[Tuple[str, str]]):
        """
        Postprocess NER tags to concatenate tokens under the BIO scheme

        Args:
            tags (List[Tuple[str, str]]): predicted (token, tag) pair list

        Returns:
            List[Tuple[str, str]]: postprocessed (token, tag) pair list

        """

        def _remove_tail(tag):
            if "-" in tag:
                tag = tag[2:]
            return tag

        result = list()

        word = tags[0][0]
        tag = tags[0][1]
        for pair in tags[1:]:
            token, label = pair
            if "I" in label:
                word += token
            else:
                word = word.strip()
                result.append((word.replace("##", ""), _remove_tail(tag)))
                word = token
                tag = label

        word = word.strip()
        result.append((word.replace("##", ""), _remove_tail(tag)))
        return result
    def predict(self, sent: str):
        """
        Conduct named entity recognition with Japanese RoBERTa

        Args:
            sent: (str) sentence to be sequence labeled

        Returns:
            List[Tuple[str, str]]: token and its predicted tag tuple list

        """
        return self._postprocess(self._model.predict_tags(sent))
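# Illustrative sketch of the wordpiece merging above. The "##" continuation
# prefix is an assumption inferred from the `replace("##", "")` call;
# `_postprocess` does not use `self`, so it can be exercised directly:
#
#     >>> PororoBertNerJa._postprocess(
#     ...     None,  # `self` is unused by this helper
#     ...     [("豊", "B-PERSON"), ("##臣", "I-PERSON"), ("##秀", "I-PERSON"), ("##吉", "I-PERSON"), ("は", "O")],
#     ... )
#     [('豊臣秀吉', 'PERSON'), ('は', 'O')]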