Module ktrain.text.textextractor
Expand source code
from ..imports import *
from . import textutils as TU
class TextExtractor:
    """
    ```
    Text Extractor: a wrapper around the textract package.

    Files whose MIME type is text/* are read directly from disk; every
    other file is handed to textract.process.
    ```
    """

    def __init__(self):
        # textract is imported lazily so that importing this module does not
        # require textract to be installed.
        try:
            import textract
        except ImportError as e:
            # ImportError is a subclass of Exception, so callers that catch
            # Exception keep working; chaining preserves the root cause.
            raise ImportError('TextExtractor requires textract: pip install textract') from e
        self.process = textract.process

    def extract(self, filename=None, text=None, return_format='document', lang=None, verbose=1):
        """
        ```
        Extracts text from a document given the file path to the document.
        The extract method can also simply accept a string and return
        lists of sentences or paragraphs.

        filename(str): path to file. Mutually-exclusive with text.
        text(str): string to tokenize. Mutually-exclusive with filename.
                   bytes are also accepted and are decoded with
                   undecodable bytes dropped.
        return_format(str): One of {'document', 'paragraphs', 'sentences'}
                            'document': returns text of document
                            'paragraphs': returns a list of paragraphs from document
                            'sentences': returns a list of sentences from document
        lang(str): language code. If None, lang will be detected from extracted text
        verbose(bool): verbosity. If truthy, extraction errors are printed.

        Returns the text (str) for 'document', otherwise the result of
        TU.paragraph_tokenize / TU.sent_tokenize.

        Raises ValueError if neither or both of filename and text are
        supplied, or if return_format is not a supported value.

        Extraction errors are not raised: they are printed (when verbose)
        and the text is left as None, so 'document' returns None.
        ```
        """
        if filename is None and text is None:
            raise ValueError('Either the filename parameter or the text parameter must be supplied')
        if filename is not None and text is not None:
            raise ValueError('The filename and text parameters are mutually-exclusive.')
        if return_format not in ('document', 'paragraphs', 'sentences'):
            raise ValueError('return_format must be one of {"document", "paragraphs", "sentences"}')

        if filename is not None:
            mtype = TU.get_mimetype(filename)
            try:
                if mtype and mtype.split('/')[0] == 'text':
                    # errors='ignore' drops undecodable bytes instead of
                    # failing the whole file, matching the lenient decoding
                    # applied to textract output below.
                    with open(filename, 'r', errors='ignore') as f:
                        text = f.read()
                else:
                    text = self.process(filename)
            except Exception as e:
                # Best-effort: report the failure and continue with text=None.
                if verbose:
                    print('ERROR on %s:\n%s' % (filename, e))

        # textract (and callers passing raw bytes) yield bytes; normalize to str.
        if isinstance(text, (bytes, bytearray)):
            text = text.decode(errors='ignore')

        # NOTE(review): after a failed extraction text is None here and is
        # passed to the tokenizers unchanged - confirm how TU handles None.
        if return_format == 'sentences':
            return TU.sent_tokenize(text, lang=lang)
        elif return_format == 'paragraphs':
            return TU.paragraph_tokenize(text, join_sentences=True, lang=lang)
        else:
            return text
Classes
class TextExtractor
-
Text Extractor: a wrapper around the textract package
Expand source code
class TextExtractor:
    """
    ```
    Text Extractor: a wrapper around the textract package.

    Files whose MIME type is text/* are read directly from disk; every
    other file is handed to textract.process.
    ```
    """

    def __init__(self):
        # textract is imported lazily so that importing this module does not
        # require textract to be installed.
        try:
            import textract
        except ImportError as e:
            # ImportError is a subclass of Exception, so callers that catch
            # Exception keep working; chaining preserves the root cause.
            raise ImportError('TextExtractor requires textract: pip install textract') from e
        self.process = textract.process

    def extract(self, filename=None, text=None, return_format='document', lang=None, verbose=1):
        """
        ```
        Extracts text from a document given the file path to the document.
        The extract method can also simply accept a string and return
        lists of sentences or paragraphs.

        filename(str): path to file. Mutually-exclusive with text.
        text(str): string to tokenize. Mutually-exclusive with filename.
                   bytes are also accepted and are decoded with
                   undecodable bytes dropped.
        return_format(str): One of {'document', 'paragraphs', 'sentences'}
                            'document': returns text of document
                            'paragraphs': returns a list of paragraphs from document
                            'sentences': returns a list of sentences from document
        lang(str): language code. If None, lang will be detected from extracted text
        verbose(bool): verbosity. If truthy, extraction errors are printed.

        Returns the text (str) for 'document', otherwise the result of
        TU.paragraph_tokenize / TU.sent_tokenize.

        Raises ValueError if neither or both of filename and text are
        supplied, or if return_format is not a supported value.

        Extraction errors are not raised: they are printed (when verbose)
        and the text is left as None, so 'document' returns None.
        ```
        """
        if filename is None and text is None:
            raise ValueError('Either the filename parameter or the text parameter must be supplied')
        if filename is not None and text is not None:
            raise ValueError('The filename and text parameters are mutually-exclusive.')
        if return_format not in ('document', 'paragraphs', 'sentences'):
            raise ValueError('return_format must be one of {"document", "paragraphs", "sentences"}')

        if filename is not None:
            mtype = TU.get_mimetype(filename)
            try:
                if mtype and mtype.split('/')[0] == 'text':
                    # errors='ignore' drops undecodable bytes instead of
                    # failing the whole file, matching the lenient decoding
                    # applied to textract output below.
                    with open(filename, 'r', errors='ignore') as f:
                        text = f.read()
                else:
                    text = self.process(filename)
            except Exception as e:
                # Best-effort: report the failure and continue with text=None.
                if verbose:
                    print('ERROR on %s:\n%s' % (filename, e))

        # textract (and callers passing raw bytes) yield bytes; normalize to str.
        if isinstance(text, (bytes, bytearray)):
            text = text.decode(errors='ignore')

        # NOTE(review): after a failed extraction text is None here and is
        # passed to the tokenizers unchanged - confirm how TU handles None.
        if return_format == 'sentences':
            return TU.sent_tokenize(text, lang=lang)
        elif return_format == 'paragraphs':
            return TU.paragraph_tokenize(text, join_sentences=True, lang=lang)
        else:
            return text
Methods
def extract(self, filename=None, text=None, return_format='document', lang=None, verbose=1)
-
Extracts text from a document, given the file path to the document. filename(str): path to file. Mutually exclusive with text. text(str): string to tokenize. Mutually exclusive with filename. The extract method can also simply accept a string and return lists of sentences or paragraphs. return_format(str): one of {'document', 'paragraphs', 'sentences'}. 'document': returns the text of the document; 'paragraphs': returns a list of paragraphs from the document; 'sentences': returns a list of sentences from the document. lang(str): language code. If None, the language is detected from the extracted text. verbose(bool): verbosity.
Expand source code
def extract(self, filename=None, text=None, return_format='document', lang=None, verbose=1):
    """
    ```
    Extracts text from a document given the file path to the document.
    The extract method can also simply accept a string and return
    lists of sentences or paragraphs.

    filename(str): path to file. Mutually-exclusive with text.
    text(str): string to tokenize. Mutually-exclusive with filename.
               bytes are also accepted and are decoded with
               undecodable bytes dropped.
    return_format(str): One of {'document', 'paragraphs', 'sentences'}
                        'document': returns text of document
                        'paragraphs': returns a list of paragraphs from document
                        'sentences': returns a list of sentences from document
    lang(str): language code. If None, lang will be detected from extracted text
    verbose(bool): verbosity. If truthy, extraction errors are printed.

    Returns the text (str) for 'document', otherwise the result of
    TU.paragraph_tokenize / TU.sent_tokenize.

    Raises ValueError if neither or both of filename and text are
    supplied, or if return_format is not a supported value.

    Extraction errors are not raised: they are printed (when verbose)
    and the text is left as None, so 'document' returns None.
    ```
    """
    if filename is None and text is None:
        raise ValueError('Either the filename parameter or the text parameter must be supplied')
    if filename is not None and text is not None:
        raise ValueError('The filename and text parameters are mutually-exclusive.')
    if return_format not in ('document', 'paragraphs', 'sentences'):
        raise ValueError('return_format must be one of {"document", "paragraphs", "sentences"}')

    if filename is not None:
        mtype = TU.get_mimetype(filename)
        try:
            if mtype and mtype.split('/')[0] == 'text':
                # errors='ignore' drops undecodable bytes instead of
                # failing the whole file, matching the lenient decoding
                # applied to textract output below.
                with open(filename, 'r', errors='ignore') as f:
                    text = f.read()
            else:
                text = self.process(filename)
        except Exception as e:
            # Best-effort: report the failure and continue with text=None.
            if verbose:
                print('ERROR on %s:\n%s' % (filename, e))

    # textract (and callers passing raw bytes) yield bytes; normalize to str.
    if isinstance(text, (bytes, bytearray)):
        text = text.decode(errors='ignore')

    # NOTE(review): after a failed extraction text is None here and is
    # passed to the tokenizers unchanged - confirm how TU handles None.
    if return_format == 'sentences':
        return TU.sent_tokenize(text, lang=lang)
    elif return_format == 'paragraphs':
        return TU.paragraph_tokenize(text, join_sentences=True, lang=lang)
    else:
        return text