Module ktrain.text.translation.core
Expand source code
from ...imports import *
from ... import utils as U
from .. import textutils as TU
from ...torch_base import TorchBase
# Language codes for which a Helsinki-NLP opus-mt-*-en model (or the shared
# ROMANCE-group model for es/fr/it/pt) is wired up in EnglishTranslator below.
SUPPORTED_SRC_LANGS = ['zh', 'ar', 'ru', 'de', 'af', 'es', 'fr', 'it', 'pt']
class Translator(TorchBase):
    """
    Translator: basic wrapper around MarianMT model for language translation
    """

    def __init__(self, model_name=None, device=None, quantize=False):
        """
        ```
        basic wrapper around MarianMT model for language translation
        Args:
          model_name(str): Helsinki-NLP model (e.g., 'Helsinki-NLP/opus-mt-de-en').
                           Required.
          device(str): device to use (e.g., 'cuda', 'cpu')
          quantize(bool): If True, use quantization.
        ```
        """
        if model_name is None:
            # Fail fast with a clear error instead of the opaque TypeError that
            # the substring test below would raise for a None model_name.
            raise ValueError(
                'model_name is required and should be a Helsinki-NLP model: '
                'https://huggingface.co/Helsinki-NLP')
        if 'Helsinki-NLP' not in model_name:
            warnings.warn('Translator requires a Helsinki-NLP model: https://huggingface.co/Helsinki-NLP')
        super().__init__(device=device, quantize=quantize)
        from transformers import MarianMTModel, MarianTokenizer
        self.tokenizer = MarianTokenizer.from_pretrained(model_name)
        self.model = MarianMTModel.from_pretrained(model_name).to(self.torch_device)
        if quantize:
            self.model = self.quantize_model(self.model)

    def translate(self, src_text, join_with='\n', num_beams=None, early_stopping=None):
        """
        ```
        Translate document (src_text).
        To speed up translations, you can set num_beams and early_stopping (e.g., num_beams=4, early_stopping=True).
        Args:
          src_text(str): source text.
                         The source text can either be a single sentence or an entire document with multiple sentences
                         and paragraphs.
                         IMPORTANT NOTE: Sentences are joined together and fed to model as single batch.
                                         If the input text is very large (e.g., an entire book), you should
                                         break it up into reasonably-sized chunks (e.g., pages, paragraphs, or sentences) and
                                         feed each chunk separately into translate to avoid out-of-memory issues.
          join_with(str): list of translated sentences will be delimited with this character.
                          default: each sentence on separate line
          num_beams(int): Number of beams for beam search. Defaults to None. If None, the transformers library defaults this to 1,
                          which means no beam search.
          early_stopping(bool): Whether to stop the beam search when at least ``num_beams`` sentences
                                are finished per batch or not. Defaults to None. If None, the transformers library
                                sets this to False.
        Returns:
          str: translated text
        ```
        """
        # Sentence-tokenize the document, translate as one batch, then re-join.
        sentences = TU.sent_tokenize(src_text)
        tgt_sentences = self.translate_sentences(sentences, num_beams=num_beams, early_stopping=early_stopping)
        return join_with.join(tgt_sentences)

    def translate_sentences(self, sentences, num_beams=None, early_stopping=None):
        """
        ```
        Translate sentences using model_name as model.
        To speed up translations, you can set num_beams and early_stopping (e.g., num_beams=4, early_stopping=True).
        Args:
          sentences(list): list of strings representing sentences that need to be translated
                           IMPORTANT NOTE: Sentences are joined together and fed to model as single batch.
                                           If the input text is very large (e.g., an entire book), you should
                                           break it up into reasonably-sized chunks (e.g., pages, paragraphs, or sentences) and
                                           feed each chunk separately into translate to avoid out-of-memory issues.
          num_beams(int): Number of beams for beam search. Defaults to None. If None, the transformers library defaults this to 1,
                          which means no beam search.
          early_stopping(bool): Whether to stop the beam search when at least ``num_beams`` sentences
                                are finished per batch or not. Defaults to None. If None, the transformers library
                                sets this to False.
        Returns:
          list: translated sentences
        ```
        """
        import torch
        # tokenizer.prepare_seq2seq_batch is deprecated and removed in recent
        # transformers releases; calling the tokenizer directly is the supported
        # equivalent (pads the batch to a common length, truncates over-long inputs).
        batch = self.tokenizer(sentences, return_tensors='pt', padding=True,
                               truncation=True).to(self.torch_device)
        with torch.no_grad():
            translated = self.model.generate(**batch,
                                             num_beams=num_beams, early_stopping=early_stopping)
        tgt_sentences = [self.tokenizer.decode(t, skip_special_tokens=True) for t in translated]
        return tgt_sentences
class EnglishTranslator():
    """
    Class to translate text in various languages to English.
    """

    def __init__(self, src_lang=None, device=None, quantize=False):
        """
        ```
        Constructor for English translator
        Args:
          src_lang(str): language code of source language.
                         Must be one of SUPPORTED_SRC_LANGS:
                           'zh': Chinese (either traditional or simplified)
                           'ar': Arabic
                           'ru': Russian
                           'de': German
                           'af': Afrikaans
                           'es': Spanish
                           'fr': French
                           'it': Italian
                           'pt': Portuguese
          device(str): device to use (e.g., 'cuda', 'cpu')
          quantize(bool): If True, use quantization.
        ```
        """
        if src_lang is None or src_lang not in SUPPORTED_SRC_LANGS:
            # BUG FIX: previously referenced the undefined name SUPPORED_SRC_LANG,
            # so invalid input raised NameError instead of the intended ValueError.
            raise ValueError('A src_lang must be supplied and be one of: %s' % (SUPPORTED_SRC_LANGS))
        self.src_lang = src_lang

        # Map each supported language code to its Helsinki-NLP *-to-English model.
        # The Romance languages share a single multilingual ROMANCE-en model.
        lang2model = {
            'zh': 'Helsinki-NLP/opus-mt-zh-en',
            'ar': 'Helsinki-NLP/opus-mt-ar-en',
            'ru': 'Helsinki-NLP/opus-mt-ru-en',
            'de': 'Helsinki-NLP/opus-mt-de-en',
            'af': 'Helsinki-NLP/opus-mt-af-en',
            'es': 'Helsinki-NLP/opus-mt-ROMANCE-en',
            'fr': 'Helsinki-NLP/opus-mt-ROMANCE-en',
            'it': 'Helsinki-NLP/opus-mt-ROMANCE-en',
            'pt': 'Helsinki-NLP/opus-mt-ROMANCE-en',
        }
        # A pipeline of translators; currently a single step for every language,
        # but kept as a list so multi-hop translation (e.g., X->de->en) can be
        # added without changing translate().
        self.translators = []
        model_name = lang2model.get(src_lang)
        if model_name is None:
            # Defensive: unreachable while lang2model covers SUPPORTED_SRC_LANGS.
            raise ValueError('lang:%s is currently not supported.' % (src_lang))
        self.translators.append(Translator(model_name=model_name, device=device, quantize=quantize))

    def translate(self, src_text, join_with='\n', num_beams=None, early_stopping=None):
        """
        ```
        Translate source document to English.
        To speed up translations, you can set num_beams and early_stopping (e.g., num_beams=4, early_stopping=True).
        Args:
          src_text(str): source text. Must be in language specified by src_lang (language code) supplied to constructor
                         The source text can either be a single sentence or an entire document with multiple sentences
                         and paragraphs.
                         IMPORTANT NOTE: Sentences are joined together and fed to model as single batch.
                                         If the input text is very large (e.g., an entire book), you should
                                         break it up into reasonably-sized chunks (e.g., pages, paragraphs, or sentences) and
                                         feed each chunk separately into translate to avoid out-of-memory issues.
          join_with(str): list of translated sentences will be delimited with this character.
                          default: each sentence on separate line
          num_beams(int): Number of beams for beam search. Defaults to None. If None, the transformers library defaults this to 1,
                          which means no beam search.
          early_stopping(bool): Whether to stop the beam search when at least ``num_beams`` sentences
                                are finished per batch or not. Defaults to None. If None, the transformers library
                                sets this to False.
        Returns:
          str: translated text
        ```
        """
        # Feed the text through each translator in the pipeline in order.
        text = src_text
        for t in self.translators:
            text = t.translate(text, join_with=join_with, num_beams=num_beams, early_stopping=early_stopping)
        return text
Classes
class EnglishTranslator (src_lang=None, device=None, quantize=False)
-
Class to translate text in various languages to English.
Constructor for English translator Args: src_lang(str): language code of source language. Must be one of SUPPORTED_SRC_LANGS: 'zh': Chinese (either traditional or simplified) 'ar': Arabic 'ru' : Russian 'de': German 'af': Afrikaans 'es': Spanish 'fr': French 'it': Italian 'pt': Portuguese device(str): device to use (e.g., 'cuda', 'cpu') quantize(bool): If True, use quantization.
Expand source code
class EnglishTranslator(): """ Class to translate text in various languages to English. """ def __init__(self, src_lang=None, device=None, quantize=False): """ ``` Constructor for English translator Args: src_lang(str): language code of source language. Must be one of SUPPORTED_SRC_LANGS: 'zh': Chinese (either tradtional or simplified) 'ar': Arabic 'ru' : Russian 'de': German 'af': Afrikaans 'es': Spanish 'fr': French 'it': Italian 'pt': Portuguese device(str): device to use (e.g., 'cuda', 'cpu') quantize(bool): If True, use quantization. ``` """ if src_lang is None or src_lang not in SUPPORTED_SRC_LANGS: raise ValueError('A src_lang must be supplied and be one of: %s' % (SUPPORED_SRC_LANG)) self.src_lang = src_lang self.translators = [] if src_lang == 'ar': self.translators.append(Translator(model_name='Helsinki-NLP/opus-mt-ar-en', device=device, quantize=quantize)) elif src_lang == 'ru': self.translators.append(Translator(model_name='Helsinki-NLP/opus-mt-ru-en', device=device, quantize=quantize)) elif src_lang == 'de': self.translators.append(Translator(model_name='Helsinki-NLP/opus-mt-de-en', device=device, quantize=quantize)) elif src_lang == 'af': self.translators.append(Translator(model_name='Helsinki-NLP/opus-mt-af-en', device=device, quantize=quantize)) elif src_lang in ['es', 'fr', 'it', 'pt']: self.translators.append(Translator(model_name='Helsinki-NLP/opus-mt-ROMANCE-en', device=device, quantize=quantize)) #elif src_lang == 'zh': # could not find zh->en model, so currently doing two-step translation to English via German #self.translators.append(Translator(model_name='Helsinki-NLP/opus-mt-ZH-de', device=device)) #self.translators.append(Translator(model_name='Helsinki-NLP/opus-mt-de-en', device=device)) elif src_lang == 'zh': self.translators.append(Translator(model_name='Helsinki-NLP/opus-mt-zh-en', device=device, quantize=quantize)) else: raise ValueError('lang:%s is currently not supported.' 
% (src_lang)) def translate(self, src_text, join_with='\n', num_beams=None, early_stopping=None): """ ``` Translate source document to English. To speed up translations, you can set num_beams and early_stopping (e.g., num_beams=4, early_stopping=True). Args: src_text(str): source text. Must be in language specified by src_lang (language code) supplied to constructor The source text can either be a single sentence or an entire document with multiple sentences and paragraphs. IMPORTANT NOTE: Sentences are joined together and fed to model as single batch. If the input text is very large (e.g., an entire book), you should break it up into reasonbly-sized chunks (e.g., pages, paragraphs, or sentences) and feed each chunk separately into translate to avoid out-of-memory issues. join_with(str): list of translated sentences will be delimited with this character. default: each sentence on separate line num_beams(int): Number of beams for beam search. Defaults to None. If None, the transformers library defaults this to 1, whicn means no beam search. early_stopping(bool): Whether to stop the beam search when at least ``num_beams`` sentences are finished per batch or not. Defaults to None. If None, the transformers library sets this to False. Returns: str: translated text ``` """ text = src_text for t in self.translators: text = t.translate(text, join_with=join_with, num_beams=num_beams, early_stopping=early_stopping) return text
Methods
def translate(self, src_text, join_with='\n', num_beams=None, early_stopping=None)
-
Translate source document to English. To speed up translations, you can set num_beams and early_stopping (e.g., num_beams=4, early_stopping=True). Args: src_text(str): source text. Must be in language specified by src_lang (language code) supplied to constructor The source text can either be a single sentence or an entire document with multiple sentences and paragraphs. IMPORTANT NOTE: Sentences are joined together and fed to model as single batch. If the input text is very large (e.g., an entire book), you should break it up into reasonbly-sized chunks (e.g., pages, paragraphs, or sentences) and feed each chunk separately into translate to avoid out-of-memory issues. join_with(str): list of translated sentences will be delimited with this character. default: each sentence on separate line num_beams(int): Number of beams for beam search. Defaults to None. If None, the transformers library defaults this to 1, whicn means no beam search. early_stopping(bool): Whether to stop the beam search when at least ``num_beams`` sentences are finished per batch or not. Defaults to None. If None, the transformers library sets this to False. Returns: str: translated text
Expand source code
def translate(self, src_text, join_with='\n', num_beams=None, early_stopping=None): """ ``` Translate source document to English. To speed up translations, you can set num_beams and early_stopping (e.g., num_beams=4, early_stopping=True). Args: src_text(str): source text. Must be in language specified by src_lang (language code) supplied to constructor The source text can either be a single sentence or an entire document with multiple sentences and paragraphs. IMPORTANT NOTE: Sentences are joined together and fed to model as single batch. If the input text is very large (e.g., an entire book), you should break it up into reasonbly-sized chunks (e.g., pages, paragraphs, or sentences) and feed each chunk separately into translate to avoid out-of-memory issues. join_with(str): list of translated sentences will be delimited with this character. default: each sentence on separate line num_beams(int): Number of beams for beam search. Defaults to None. If None, the transformers library defaults this to 1, whicn means no beam search. early_stopping(bool): Whether to stop the beam search when at least ``num_beams`` sentences are finished per batch or not. Defaults to None. If None, the transformers library sets this to False. Returns: str: translated text ``` """ text = src_text for t in self.translators: text = t.translate(text, join_with=join_with, num_beams=num_beams, early_stopping=early_stopping) return text
class Translator (model_name=None, device=None, quantize=False)
-
Translator: basic wrapper around MarianMT model for language translation
basic wrapper around MarianMT model for language translation Args: model_name(str): Helsinki-NLP model device(str): device to use (e.g., 'cuda', 'cpu') quantize(bool): If True, use quantization.
Expand source code
class Translator(TorchBase): """ Translator: basic wrapper around MarianMT model for language translation """ def __init__(self, model_name=None, device=None, quantize=False): """ ``` basic wrapper around MarianMT model for language translation Args: model_name(str): Helsinki-NLP model device(str): device to use (e.g., 'cuda', 'cpu') quantize(bool): If True, use quantization. ``` """ if 'Helsinki-NLP' not in model_name: warnings.warn('Translator requires a Helsinki-NLP model: https://huggingface.co/Helsinki-NLP') super().__init__(device=device, quantize=quantize) from transformers import MarianMTModel, MarianTokenizer self.tokenizer = MarianTokenizer.from_pretrained(model_name) self.model = MarianMTModel.from_pretrained(model_name).to(self.torch_device) if quantize: self.model = self.quantize_model(self.model) def translate(self, src_text, join_with='\n', num_beams=None, early_stopping=None): """ ``` Translate document (src_text). To speed up translations, you can set num_beams and early_stopping (e.g., num_beams=4, early_stopping=True). Args: src_text(str): source text. The source text can either be a single sentence or an entire document with multiple sentences and paragraphs. IMPORTANT NOTE: Sentences are joined together and fed to model as single batch. If the input text is very large (e.g., an entire book), you should break it up into reasonbly-sized chunks (e.g., pages, paragraphs, or sentences) and feed each chunk separately into translate to avoid out-of-memory issues. join_with(str): list of translated sentences will be delimited with this character. default: each sentence on separate line num_beams(int): Number of beams for beam search. Defaults to None. If None, the transformers library defaults this to 1, whicn means no beam search. early_stopping(bool): Whether to stop the beam search when at least ``num_beams`` sentences are finished per batch or not. Defaults to None. If None, the transformers library sets this to False. 
Returns: str: translated text ``` """ sentences = TU.sent_tokenize(src_text) tgt_sentences = self.translate_sentences(sentences, num_beams=num_beams, early_stopping=early_stopping) return join_with.join(tgt_sentences) def translate_sentences(self, sentences, num_beams=None, early_stopping=None): """ ``` Translate sentences using model_name as model. To speed up translations, you can set num_beams and early_stopping (e.g., num_beams=4, early_stopping=True). Args: sentences(list): list of strings representing sentences that need to be translated IMPORTANT NOTE: Sentences are joined together and fed to model as single batch. If the input text is very large (e.g., an entire book), you should break it up into reasonbly-sized chunks (e.g., pages, paragraphs, or sentences) and feed each chunk separately into translate to avoid out-of-memory issues. num_beams(int): Number of beams for beam search. Defaults to None. If None, the transformers library defaults this to 1, whicn means no beam search. early_stopping(bool): Whether to stop the beam search when at least ``num_beams`` sentences are finished per batch or not. Defaults to None. If None, the transformers library sets this to False. Returns: str: translated sentences ``` """ import torch with torch.no_grad(): translated = self.model.generate(**self.tokenizer.prepare_seq2seq_batch(sentences, return_tensors='pt').to(self.torch_device), num_beams=num_beams, early_stopping=early_stopping) tgt_sentences = [self.tokenizer.decode(t, skip_special_tokens=True) for t in translated] return tgt_sentences
Ancestors
Methods
def translate(self, src_text, join_with='\n', num_beams=None, early_stopping=None)
-
Translate document (src_text). To speed up translations, you can set num_beams and early_stopping (e.g., num_beams=4, early_stopping=True). Args: src_text(str): source text. The source text can either be a single sentence or an entire document with multiple sentences and paragraphs. IMPORTANT NOTE: Sentences are joined together and fed to model as single batch. If the input text is very large (e.g., an entire book), you should break it up into reasonbly-sized chunks (e.g., pages, paragraphs, or sentences) and feed each chunk separately into translate to avoid out-of-memory issues. join_with(str): list of translated sentences will be delimited with this character. default: each sentence on separate line num_beams(int): Number of beams for beam search. Defaults to None. If None, the transformers library defaults this to 1, whicn means no beam search. early_stopping(bool): Whether to stop the beam search when at least ``num_beams`` sentences are finished per batch or not. Defaults to None. If None, the transformers library sets this to False. Returns: str: translated text
Expand source code
def translate(self, src_text, join_with='\n', num_beams=None, early_stopping=None): """ ``` Translate document (src_text). To speed up translations, you can set num_beams and early_stopping (e.g., num_beams=4, early_stopping=True). Args: src_text(str): source text. The source text can either be a single sentence or an entire document with multiple sentences and paragraphs. IMPORTANT NOTE: Sentences are joined together and fed to model as single batch. If the input text is very large (e.g., an entire book), you should break it up into reasonbly-sized chunks (e.g., pages, paragraphs, or sentences) and feed each chunk separately into translate to avoid out-of-memory issues. join_with(str): list of translated sentences will be delimited with this character. default: each sentence on separate line num_beams(int): Number of beams for beam search. Defaults to None. If None, the transformers library defaults this to 1, whicn means no beam search. early_stopping(bool): Whether to stop the beam search when at least ``num_beams`` sentences are finished per batch or not. Defaults to None. If None, the transformers library sets this to False. Returns: str: translated text ``` """ sentences = TU.sent_tokenize(src_text) tgt_sentences = self.translate_sentences(sentences, num_beams=num_beams, early_stopping=early_stopping) return join_with.join(tgt_sentences)
def translate_sentences(self, sentences, num_beams=None, early_stopping=None)
-
Translate sentences using model_name as model. To speed up translations, you can set num_beams and early_stopping (e.g., num_beams=4, early_stopping=True). Args: sentences(list): list of strings representing sentences that need to be translated IMPORTANT NOTE: Sentences are joined together and fed to model as single batch. If the input text is very large (e.g., an entire book), you should break it up into reasonbly-sized chunks (e.g., pages, paragraphs, or sentences) and feed each chunk separately into translate to avoid out-of-memory issues. num_beams(int): Number of beams for beam search. Defaults to None. If None, the transformers library defaults this to 1, whicn means no beam search. early_stopping(bool): Whether to stop the beam search when at least ``num_beams`` sentences are finished per batch or not. Defaults to None. If None, the transformers library sets this to False. Returns: str: translated sentences
Expand source code
def translate_sentences(self, sentences, num_beams=None, early_stopping=None): """ ``` Translate sentences using model_name as model. To speed up translations, you can set num_beams and early_stopping (e.g., num_beams=4, early_stopping=True). Args: sentences(list): list of strings representing sentences that need to be translated IMPORTANT NOTE: Sentences are joined together and fed to model as single batch. If the input text is very large (e.g., an entire book), you should break it up into reasonbly-sized chunks (e.g., pages, paragraphs, or sentences) and feed each chunk separately into translate to avoid out-of-memory issues. num_beams(int): Number of beams for beam search. Defaults to None. If None, the transformers library defaults this to 1, whicn means no beam search. early_stopping(bool): Whether to stop the beam search when at least ``num_beams`` sentences are finished per batch or not. Defaults to None. If None, the transformers library sets this to False. Returns: str: translated sentences ``` """ import torch with torch.no_grad(): translated = self.model.generate(**self.tokenizer.prepare_seq2seq_batch(sentences, return_tensors='pt').to(self.torch_device), num_beams=num_beams, early_stopping=early_stopping) tgt_sentences = [self.tokenizer.decode(t, skip_special_tokens=True) for t in translated] return tgt_sentences
Inherited members