Module ktrain.text.translation.core

Expand source code
from ...imports import *
from ... import utils as U
from .. import textutils as TU
from ...torch_base import TorchBase

# Language codes accepted as the `src_lang` argument to EnglishTranslator:
# each maps to a pretrained Helsinki-NLP MarianMT <lang>->en model.
SUPPORTED_SRC_LANGS = ['zh', 'ar', 'ru', 'de', 'af', 'es', 'fr', 'it', 'pt']

class Translator(TorchBase):
    """
    Translator: basic wrapper around a MarianMT model for language translation.
    """

    def __init__(self, model_name=None, device=None, quantize=False):
        """
        ```
        basic wrapper around MarianMT model for language translation

        Args:
          model_name(str): a Helsinki-NLP model name (e.g., 'Helsinki-NLP/opus-mt-de-en')
          device(str): device to use (e.g., 'cuda', 'cpu')
          quantize(bool): If True, use quantization.
        ```
        """
        # Fail fast with a clear message: the original code evaluated
        # `'Helsinki-NLP' not in model_name`, which raises an opaque
        # TypeError when model_name is left as None.
        if model_name is None:
            raise ValueError('model_name is required (e.g., "Helsinki-NLP/opus-mt-de-en")')
        if 'Helsinki-NLP' not in model_name:
            warnings.warn('Translator requires a Helsinki-NLP model: https://huggingface.co/Helsinki-NLP')
        super().__init__(device=device, quantize=quantize)
        from transformers import MarianMTModel, MarianTokenizer
        self.tokenizer = MarianTokenizer.from_pretrained(model_name)
        self.model = MarianMTModel.from_pretrained(model_name).to(self.torch_device)
        if quantize:
            self.model = self.quantize_model(self.model)


    def translate(self, src_text, join_with='\n', num_beams=None, early_stopping=None):
        """
        ```
        Translate document (src_text).
        To speed up translations, you can set num_beams and early_stopping (e.g., num_beams=4, early_stopping=True).
        Args:
          src_text(str): source text.
                         The source text can either be a single sentence or an entire document with multiple sentences
                         and paragraphs.
                         IMPORTANT NOTE: Sentences are joined together and fed to model as single batch.
                                         If the input text is very large (e.g., an entire book), you should
                                         break it up into reasonably-sized chunks (e.g., pages, paragraphs, or sentences) and
                                         feed each chunk separately into translate to avoid out-of-memory issues.
          join_with(str):  list of translated sentences will be delimited with this character.
                           default: each sentence on separate line
          num_beams(int): Number of beams for beam search. Defaults to None.  If None, the transformers library defaults this to 1,
                          which means no beam search.
          early_stopping(bool):  Whether to stop the beam search when at least ``num_beams`` sentences
                                 are finished per batch or not. Defaults to None.  If None, the transformers library
                                 sets this to False.
        Returns:
          str: translated text
        ```
        """
        # Split the document into sentences, translate them as one batch,
        # then re-assemble with the requested delimiter.
        sentences = TU.sent_tokenize(src_text)
        tgt_sentences = self.translate_sentences(sentences, num_beams=num_beams, early_stopping=early_stopping)
        return join_with.join(tgt_sentences)


    def translate_sentences(self, sentences, num_beams=None, early_stopping=None):
        """
        ```
        Translate sentences using model_name as model.
        To speed up translations, you can set num_beams and early_stopping (e.g., num_beams=4, early_stopping=True).
        Args:
          sentences(list): list of strings representing sentences that need to be translated
                         IMPORTANT NOTE: Sentences are joined together and fed to model as single batch.
                                         If the input text is very large (e.g., an entire book), you should
                                         break it up into reasonably-sized chunks (e.g., pages, paragraphs, or sentences) and
                                         feed each chunk separately into translate to avoid out-of-memory issues.
          num_beams(int): Number of beams for beam search. Defaults to None.  If None, the transformers library defaults this to 1,
                          which means no beam search.
          early_stopping(bool):  Whether to stop the beam search when at least ``num_beams`` sentences
                                 are finished per batch or not. Defaults to None.  If None, the transformers library
                                 sets this to False.
        Returns:
          list: translated sentences
        ```
        """
        import torch
        with torch.no_grad():
            # NOTE: tokenizer.prepare_seq2seq_batch is deprecated (removed in
            # transformers>=5); calling the tokenizer directly with
            # padding/truncation is the documented replacement and produces
            # the same padded batch.
            batch = self.tokenizer(sentences, return_tensors='pt', padding=True, truncation=True).to(self.torch_device)
            translated = self.model.generate(**batch, num_beams=num_beams, early_stopping=early_stopping)
            tgt_sentences = [self.tokenizer.decode(t, skip_special_tokens=True) for t in translated]
        return tgt_sentences



class EnglishTranslator():
    """
    Class to translate text in various languages to English.
    """

    def __init__(self, src_lang=None, device=None, quantize=False):
        """
        ```
        Constructor for English translator

        Args:
          src_lang(str): language code of source language.
                         Must be one of SUPPORTED_SRC_LANGS:
                           'zh': Chinese (either traditional or simplified)
                           'ar': Arabic
                           'ru' : Russian
                           'de': German
                           'af': Afrikaans
                           'es': Spanish
                           'fr': French
                           'it': Italian
                           'pt': Portuguese
          device(str): device to use (e.g., 'cuda', 'cpu')
          quantize(bool): If True, use quantization.
        ```
        """

        # BUG FIX: the original error message referenced the undefined name
        # SUPPORED_SRC_LANG, so an invalid src_lang raised NameError instead
        # of the intended ValueError.
        if src_lang is None or src_lang not in SUPPORTED_SRC_LANGS:
            raise ValueError('A src_lang must be supplied and be one of: %s' % (SUPPORTED_SRC_LANGS))
        self.src_lang = src_lang

        # Map each supported language code to its Helsinki-NLP <lang>->en model.
        # The Romance languages all share the single multilingual ROMANCE-en model.
        # NOTE: zh->en previously required a two-step translation via German
        # (opus-mt-ZH-de then opus-mt-de-en); a direct zh->en model is now used.
        lang2model = {
            'ar': 'Helsinki-NLP/opus-mt-ar-en',
            'ru': 'Helsinki-NLP/opus-mt-ru-en',
            'de': 'Helsinki-NLP/opus-mt-de-en',
            'af': 'Helsinki-NLP/opus-mt-af-en',
            'es': 'Helsinki-NLP/opus-mt-ROMANCE-en',
            'fr': 'Helsinki-NLP/opus-mt-ROMANCE-en',
            'it': 'Helsinki-NLP/opus-mt-ROMANCE-en',
            'pt': 'Helsinki-NLP/opus-mt-ROMANCE-en',
            'zh': 'Helsinki-NLP/opus-mt-zh-en',
        }
        if src_lang not in lang2model:
            raise ValueError('lang:%s is currently not supported.' % (src_lang))

        # translators is a pipeline: each translator's output feeds the next.
        # Currently every language needs only a single-step translation.
        self.translators = [Translator(model_name=lang2model[src_lang], device=device, quantize=quantize)]


    def translate(self, src_text, join_with='\n', num_beams=None, early_stopping=None):
        """
        ```
        Translate source document to English.
        To speed up translations, you can set num_beams and early_stopping (e.g., num_beams=4, early_stopping=True).

        Args:
          src_text(str): source text. Must be in language specified by src_lang (language code) supplied to constructor
                         The source text can either be a single sentence or an entire document with multiple sentences
                         and paragraphs.
                         IMPORTANT NOTE: Sentences are joined together and fed to model as single batch.
                                         If the input text is very large (e.g., an entire book), you should
                                         break it up into reasonably-sized chunks (e.g., pages, paragraphs, or sentences) and
                                         feed each chunk separately into translate to avoid out-of-memory issues.
          join_with(str):  list of translated sentences will be delimited with this character.
                           default: each sentence on separate line
          num_beams(int): Number of beams for beam search. Defaults to None.  If None, the transformers library defaults this to 1,
                          which means no beam search.
          early_stopping(bool):  Whether to stop the beam search when at least ``num_beams`` sentences
                                 are finished per batch or not. Defaults to None.  If None, the transformers library
                                 sets this to False.
        Returns:
          str: translated text
        ```
        """
        # Chain the text through each translator in the pipeline.
        text = src_text
        for t in self.translators:
            text = t.translate(text, join_with=join_with, num_beams=num_beams, early_stopping=early_stopping)
        return text
            

Classes

class EnglishTranslator (src_lang=None, device=None, quantize=False)

Class to translate text in various languages to English.

Constructor for English translator

Args:
  src_lang(str): language code of source language.
                 Must be one of SUPPORTED_SRC_LANGS:
                   'zh': Chinese (either traditional or simplified)
                   'ar': Arabic
                   'ru' : Russian
                   'de': German
                   'af': Afrikaans
                   'es': Spanish
                   'fr': French
                   'it': Italian
                   'pt': Portuguese
  device(str): device to use (e.g., 'cuda', 'cpu')
  quantize(bool): If True, use quantization.
Expand source code
class EnglishTranslator():
    """
    Class to translate text in various languages to English.
    """

    def __init__(self, src_lang=None, device=None, quantize=False):
        """
        ```
        Constructor for English translator

        Args:
          src_lang(str): language code of source language.
                         Must be one of SUPPORTED_SRC_LANGS:
                           'zh': Chinese (either tradtional or simplified)
                           'ar': Arabic
                           'ru' : Russian
                           'de': German
                           'af': Afrikaans
                           'es': Spanish
                           'fr': French
                           'it': Italian
                           'pt': Portuguese
          device(str): device to use (e.g., 'cuda', 'cpu')
          quantize(bool): If True, use quantization.
        ```
        """

        if src_lang is None or src_lang not in SUPPORTED_SRC_LANGS:
            raise ValueError('A src_lang must be supplied and be one of: %s' % (SUPPORTED_SRC_LANGS))
        self.src_lang = src_lang
        self.translators = []
        if src_lang == 'ar':
            self.translators.append(Translator(model_name='Helsinki-NLP/opus-mt-ar-en', device=device, quantize=quantize))
        elif src_lang == 'ru':
            self.translators.append(Translator(model_name='Helsinki-NLP/opus-mt-ru-en', device=device, quantize=quantize))
        elif src_lang == 'de':
            self.translators.append(Translator(model_name='Helsinki-NLP/opus-mt-de-en', device=device, quantize=quantize))
        elif src_lang == 'af':
            self.translators.append(Translator(model_name='Helsinki-NLP/opus-mt-af-en', device=device, quantize=quantize))
        elif src_lang in ['es', 'fr', 'it', 'pt']:
            self.translators.append(Translator(model_name='Helsinki-NLP/opus-mt-ROMANCE-en', device=device, quantize=quantize))
        #elif src_lang == 'zh': # could not find zh->en model, so currently doing two-step translation to English via German
            #self.translators.append(Translator(model_name='Helsinki-NLP/opus-mt-ZH-de', device=device))
            #self.translators.append(Translator(model_name='Helsinki-NLP/opus-mt-de-en', device=device))
        elif src_lang == 'zh':
            self.translators.append(Translator(model_name='Helsinki-NLP/opus-mt-zh-en', device=device, quantize=quantize))
        else:
            raise ValueError('lang:%s is currently not supported.' % (src_lang))


    def translate(self, src_text, join_with='\n', num_beams=None, early_stopping=None):
        """
        ```
        Translate source document to English.
        To speed up translations, you can set num_beams and early_stopping (e.g., num_beams=4, early_stopping=True).

        Args:
          src_text(str): source text. Must be in language specified by src_lang (language code) supplied to constructor
                         The source text can either be a single sentence or an entire document with multiple sentences
                         and paragraphs. 
                         IMPORTANT NOTE: Sentences are joined together and fed to model as single batch.
                                         If the input text is very large (e.g., an entire book), you should
                                         break it up into reasonbly-sized chunks (e.g., pages, paragraphs, or sentences) and 
                                         feed each chunk separately into translate to avoid out-of-memory issues.
          join_with(str):  list of translated sentences will be delimited with this character.
                           default: each sentence on separate line
          num_beams(int): Number of beams for beam search. Defaults to None.  If None, the transformers library defaults this to 1, 
                          whicn means no beam search.
          early_stopping(bool):  Whether to stop the beam search when at least ``num_beams`` sentences 
                                 are finished per batch or not. Defaults to None.  If None, the transformers library
                                 sets this to False.
        Returns:
          str: translated text
        ```
        """
        text = src_text
        for t in self.translators:
             text = t.translate(text, join_with=join_with, num_beams=num_beams, early_stopping=early_stopping)
        return text

Methods

def translate(self, src_text, join_with='\n', num_beams=None, early_stopping=None)
Translate source document to English.
To speed up translations, you can set num_beams and early_stopping (e.g., num_beams=4, early_stopping=True).

Args:
  src_text(str): source text. Must be in language specified by src_lang (language code) supplied to constructor
                 The source text can either be a single sentence or an entire document with multiple sentences
                 and paragraphs. 
                 IMPORTANT NOTE: Sentences are joined together and fed to model as single batch.
                                 If the input text is very large (e.g., an entire book), you should
                                  break it up into reasonably-sized chunks (e.g., pages, paragraphs, or sentences) and 
                                 feed each chunk separately into translate to avoid out-of-memory issues.
  join_with(str):  list of translated sentences will be delimited with this character.
                   default: each sentence on separate line
  num_beams(int): Number of beams for beam search. Defaults to None.  If None, the transformers library defaults this to 1, 
                  which means no beam search.
  early_stopping(bool):  Whether to stop the beam search when at least ``num_beams`` sentences 
                         are finished per batch or not. Defaults to None.  If None, the transformers library
                         sets this to False.
Returns:
  str: translated text
Expand source code
def translate(self, src_text, join_with='\n', num_beams=None, early_stopping=None):
    """
    ```
    Translate source document to English.
    To speed up translations, you can set num_beams and early_stopping (e.g., num_beams=4, early_stopping=True).

    Args:
      src_text(str): source text. Must be in language specified by src_lang (language code) supplied to constructor
                     The source text can either be a single sentence or an entire document with multiple sentences
                     and paragraphs. 
                     IMPORTANT NOTE: Sentences are joined together and fed to model as single batch.
                                     If the input text is very large (e.g., an entire book), you should
                                     break it up into reasonbly-sized chunks (e.g., pages, paragraphs, or sentences) and 
                                     feed each chunk separately into translate to avoid out-of-memory issues.
      join_with(str):  list of translated sentences will be delimited with this character.
                       default: each sentence on separate line
      num_beams(int): Number of beams for beam search. Defaults to None.  If None, the transformers library defaults this to 1, 
                      whicn means no beam search.
      early_stopping(bool):  Whether to stop the beam search when at least ``num_beams`` sentences 
                             are finished per batch or not. Defaults to None.  If None, the transformers library
                             sets this to False.
    Returns:
      str: translated text
    ```
    """
    text = src_text
    for t in self.translators:
         text = t.translate(text, join_with=join_with, num_beams=num_beams, early_stopping=early_stopping)
    return text
class Translator (model_name=None, device=None, quantize=False)

Translator: basic wrapper around MarianMT model for language translation

basic wrapper around MarianMT model for language translation

Args:
  model_name(str): Helsinki-NLP model
  device(str): device to use (e.g., 'cuda', 'cpu')
  quantize(bool): If True, use quantization.
Expand source code
class Translator(TorchBase):
    """
    Translator: basic wrapper around MarianMT model for language translation
    """

    def __init__(self, model_name=None, device=None, quantize=False):
        """
        ```
        basic wrapper around MarianMT model for language translation

        Args:
          model_name(str): Helsinki-NLP model
          device(str): device to use (e.g., 'cuda', 'cpu')
          quantize(bool): If True, use quantization.
        ```
        """
        if 'Helsinki-NLP' not in model_name:
            warnings.warn('Translator requires a Helsinki-NLP model: https://huggingface.co/Helsinki-NLP')
        super().__init__(device=device, quantize=quantize)
        from transformers import MarianMTModel, MarianTokenizer
        self.tokenizer = MarianTokenizer.from_pretrained(model_name)
        self.model = MarianMTModel.from_pretrained(model_name).to(self.torch_device)
        if quantize: self.model = self.quantize_model(self.model)


    def translate(self, src_text, join_with='\n', num_beams=None, early_stopping=None):
        """
        ```
        Translate document (src_text).
        To speed up translations, you can set num_beams and early_stopping (e.g., num_beams=4, early_stopping=True).
        Args:
          src_text(str): source text.
                         The source text can either be a single sentence or an entire document with multiple sentences
                         and paragraphs. 
                         IMPORTANT NOTE: Sentences are joined together and fed to model as single batch.
                                         If the input text is very large (e.g., an entire book), you should
                                         break it up into reasonbly-sized chunks (e.g., pages, paragraphs, or sentences) and 
                                         feed each chunk separately into translate to avoid out-of-memory issues.
          join_with(str):  list of translated sentences will be delimited with this character.
                           default: each sentence on separate line
          num_beams(int): Number of beams for beam search. Defaults to None.  If None, the transformers library defaults this to 1, 
                          whicn means no beam search.
          early_stopping(bool):  Whether to stop the beam search when at least ``num_beams`` sentences 
                                 are finished per batch or not. Defaults to None.  If None, the transformers library
                                 sets this to False.
        Returns:
          str: translated text
        ```
        """
        sentences = TU.sent_tokenize(src_text)
        tgt_sentences = self.translate_sentences(sentences, num_beams=num_beams, early_stopping=early_stopping)
        return join_with.join(tgt_sentences)


    def translate_sentences(self, sentences, num_beams=None, early_stopping=None):
        """
        ```
        Translate sentences using model_name as model.
        To speed up translations, you can set num_beams and early_stopping (e.g., num_beams=4, early_stopping=True).
        Args:
          sentences(list): list of strings representing sentences that need to be translated
                         IMPORTANT NOTE: Sentences are joined together and fed to model as single batch.
                                         If the input text is very large (e.g., an entire book), you should
                                         break it up into reasonbly-sized chunks (e.g., pages, paragraphs, or sentences) and 
                                         feed each chunk separately into translate to avoid out-of-memory issues.
          num_beams(int): Number of beams for beam search. Defaults to None.  If None, the transformers library defaults this to 1, 
                          whicn means no beam search.
          early_stopping(bool):  Whether to stop the beam search when at least ``num_beams`` sentences 
                                 are finished per batch or not. Defaults to None.  If None, the transformers library
                                 sets this to False.
        Returns:
          str: translated sentences
        ```
        """
        import torch
        with torch.no_grad():
            translated = self.model.generate(**self.tokenizer.prepare_seq2seq_batch(sentences, return_tensors='pt').to(self.torch_device), 
                                             num_beams=num_beams, early_stopping=early_stopping)
            tgt_sentences = [self.tokenizer.decode(t, skip_special_tokens=True) for t in translated]
        return tgt_sentences

Ancestors

Methods

def translate(self, src_text, join_with='\n', num_beams=None, early_stopping=None)
Translate document (src_text).
To speed up translations, you can set num_beams and early_stopping (e.g., num_beams=4, early_stopping=True).
Args:
  src_text(str): source text.
                 The source text can either be a single sentence or an entire document with multiple sentences
                 and paragraphs. 
                 IMPORTANT NOTE: Sentences are joined together and fed to model as single batch.
                                 If the input text is very large (e.g., an entire book), you should
                                 break it up into reasonbly-sized chunks (e.g., pages, paragraphs, or sentences) and 
                                 feed each chunk separately into translate to avoid out-of-memory issues.
  join_with(str):  list of translated sentences will be delimited with this character.
                   default: each sentence on separate line
  num_beams(int): Number of beams for beam search. Defaults to None.  If None, the transformers library defaults this to 1, 
                  whicn means no beam search.
  early_stopping(bool):  Whether to stop the beam search when at least ``num_beams`` sentences 
                         are finished per batch or not. Defaults to None.  If None, the transformers library
                         sets this to False.
Returns:
  str: translated text
Expand source code
def translate(self, src_text, join_with='\n', num_beams=None, early_stopping=None):
    """
    ```
    Translate document (src_text).
    To speed up translations, you can set num_beams and early_stopping (e.g., num_beams=4, early_stopping=True).
    Args:
      src_text(str): source text.
                     The source text can either be a single sentence or an entire document with multiple sentences
                     and paragraphs. 
                     IMPORTANT NOTE: Sentences are joined together and fed to model as single batch.
                                     If the input text is very large (e.g., an entire book), you should
                                     break it up into reasonbly-sized chunks (e.g., pages, paragraphs, or sentences) and 
                                     feed each chunk separately into translate to avoid out-of-memory issues.
      join_with(str):  list of translated sentences will be delimited with this character.
                       default: each sentence on separate line
      num_beams(int): Number of beams for beam search. Defaults to None.  If None, the transformers library defaults this to 1, 
                      whicn means no beam search.
      early_stopping(bool):  Whether to stop the beam search when at least ``num_beams`` sentences 
                             are finished per batch or not. Defaults to None.  If None, the transformers library
                             sets this to False.
    Returns:
      str: translated text
    ```
    """
    sentences = TU.sent_tokenize(src_text)
    tgt_sentences = self.translate_sentences(sentences, num_beams=num_beams, early_stopping=early_stopping)
    return join_with.join(tgt_sentences)
def translate_sentences(self, sentences, num_beams=None, early_stopping=None)
Translate sentences using model_name as model.
To speed up translations, you can set num_beams and early_stopping (e.g., num_beams=4, early_stopping=True).
Args:
  sentences(list): list of strings representing sentences that need to be translated
                 IMPORTANT NOTE: Sentences are joined together and fed to model as single batch.
                                 If the input text is very large (e.g., an entire book), you should
                                 break it up into reasonbly-sized chunks (e.g., pages, paragraphs, or sentences) and 
                                 feed each chunk separately into translate to avoid out-of-memory issues.
  num_beams(int): Number of beams for beam search. Defaults to None.  If None, the transformers library defaults this to 1, 
                  whicn means no beam search.
  early_stopping(bool):  Whether to stop the beam search when at least ``num_beams`` sentences 
                         are finished per batch or not. Defaults to None.  If None, the transformers library
                         sets this to False.
Returns:
  str: translated sentences
Expand source code
def translate_sentences(self, sentences, num_beams=None, early_stopping=None):
    """
    ```
    Translate sentences using model_name as model.
    To speed up translations, you can set num_beams and early_stopping (e.g., num_beams=4, early_stopping=True).
    Args:
      sentences(list): list of strings representing sentences that need to be translated
                     IMPORTANT NOTE: Sentences are joined together and fed to model as single batch.
                                     If the input text is very large (e.g., an entire book), you should
                                     break it up into reasonbly-sized chunks (e.g., pages, paragraphs, or sentences) and 
                                     feed each chunk separately into translate to avoid out-of-memory issues.
      num_beams(int): Number of beams for beam search. Defaults to None.  If None, the transformers library defaults this to 1, 
                      whicn means no beam search.
      early_stopping(bool):  Whether to stop the beam search when at least ``num_beams`` sentences 
                             are finished per batch or not. Defaults to None.  If None, the transformers library
                             sets this to False.
    Returns:
      str: translated sentences
    ```
    """
    import torch
    with torch.no_grad():
        translated = self.model.generate(**self.tokenizer.prepare_seq2seq_batch(sentences, return_tensors='pt').to(self.torch_device), 
                                         num_beams=num_beams, early_stopping=early_stopping)
        tgt_sentences = [self.tokenizer.decode(t, skip_special_tokens=True) for t in translated]
    return tgt_sentences

Inherited members