Module ktrain.text.textutils

Expand source code
from ..imports import *
from subprocess import Popen, PIPE, DEVNULL


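# The default token pattern matches either words joined by _, /, & or -
# (e.g., "state-of-the-art") or words of three or more characters that
# begin with a letter, optionally preceded by digits (e.g., "3dmax").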
DEFAULT_TOKEN_PATTERN = (r"\b[a-zA-Z][a-zA-Z0-9]*(?:[_/&-][a-zA-Z0-9]+)+\b|"
                         r"\b\d*[a-zA-Z][a-zA-Z0-9][a-zA-Z0-9]+\b")



def extract_copy(corpus_path, output_path, verbose=0):
    """
    ```
    Crawl <corpus_path>, extract plain text from documents
    and then copy them to output_path.
    Requires textract package
    Args:
        corpus_path(str):  root folder containing documents
        output_path(str):  root folder of output directory
        verbose(bool):  Default:0.  Set to 1 (or True) to see the error that caused each document to be skipped.
    Returns:
        None: prints counts and the set of skipped MIME types/extensions when finished
    ```
    """
    try:
        # TODO: change this to use TextExtractor
        import textract
    except ImportError:
        raise Exception('extract_copy requires textract: pip install textract')

    skipped = set()
    num_skipped = 0
    corpus_path = os.path.normpath(corpus_path)
    output_path = os.path.normpath(output_path)
    for idx, filename in enumerate(extract_filenames(corpus_path)):
        if idx % 1000 == 0: print('processed %s doc(s)' % (idx+1))
        mtype = get_mimetype(filename)
        try:
            if mtype and mtype.split('/')[0] == 'text':
                with open(filename, 'r') as f:
                    text = f.read()
                    text = str.encode(text)
            else:
                text = textract.process(filename)
        except Exception as e:
            if verbose:
                print('ERROR on %s:\n%s' % (filename, e))
            num_skipped += 1
            if not mtype:
                mtype = os.path.splitext(filename)[1]
                if not mtype: mtype = 'unknown'
            skipped.add(mtype)
            continue

        if not text: 
            num_skipped += 1
            continue
        fpath, fname = os.path.split(filename)
        if mtype and mtype.split('/')[0] != 'text': fname = fname+'.txt'
        relfpath = fpath.replace(corpus_path, '')
        relfpath = relfpath[1:] if relfpath and relfpath[0] == os.sep else relfpath
        opath = os.path.join(output_path, relfpath)
        if not os.path.exists(opath):
            os.makedirs(opath)
        ofilename = os.path.join(opath, fname)
        with open(ofilename, 'wb') as f:
            f.write(text)
    print('processed %s docs' % (idx+1))
    print('done.')
    print('skipped %s docs' % (num_skipped))
    if skipped: print('skipped document types: %s' % (skipped))


def get_mimetype(filepath):
    return mimetypes.guess_type(filepath)[0]

def is_txt(filepath, strict=False):
    if strict:
        return mimetypes.guess_type(filepath)[0] == 'text/plain'
    else:
        mtype = get_mimetype(filepath)
        return mtype is not None and mtype.split('/')[0] == 'text'


def is_pdf(filepath):
    return mimetypes.guess_type(filepath)[0] == 'application/pdf'



def pdftotext(filename):
    """
    ```
    Use pdftotext program to convert PDF to text string.
    :param filename: of PDF file
    :return: text from file, or empty string if failure
    ```
    """
    output = Popen(['pdftotext', '-q', filename, '-'],
                   stdout=PIPE).communicate()[0]
    # communicate() returns bytes; empty output may indicate a damaged PDF
    return '' if not output else output.decode('utf-8', errors='ignore')



def requires_ocr(filename):
    """
    ```
    Uses pdffonts program to determine if the PDF requires OCR, i.e., it
    doesn't contain any fonts.
    :param filename: of PDF file
    :return: True if requires OCR, False if not
    ```
    """
    output = Popen(['pdffonts', filename], stdout=PIPE,
                   stderr=DEVNULL).communicate()[0]
    # pdffonts prints a two-line header; fewer than 4 lines of output means no embedded fonts
    return len(output.decode('utf-8', errors='ignore').split('\n')) < 4


def extract_filenames(corpus_path, follow_links=False):
    if not os.listdir(corpus_path):
        raise ValueError("%s: path is empty" % corpus_path)
    for root, dirs, filenames in os.walk(corpus_path, followlinks=follow_links):
        for filename in filenames:
            yield os.path.join(root, filename)


def strip_control_characters(data):
    if data:
        # unicode invalid characters
        re_xml_illegal = (
            '([\u0000-\u0008\u000b-\u000c\u000e-\u001f\ufffe-\uffff])|'
            '([%s-%s][^%s-%s])|([^%s-%s][%s-%s])|([%s-%s]$)|(^[%s-%s])'
            % (chr(0xd800), chr(0xdbff), chr(0xdc00), chr(0xdfff), chr(0xd800),
               chr(0xdbff), chr(0xdc00), chr(0xdfff), chr(0xd800), chr(0xdbff),
               chr(0xdc00), chr(0xdfff))
        )
        data = re.sub(re_xml_illegal, "", data)
        # ascii control characters
        #data = re.sub(r"[\x01-\x1F\x7F]", "", data)
        # See:  http://w3.org/International/questions/qa-forms-utf-8.html
        # Printable utf-8 does not include any of these chars below x7F
        data = re.sub(r"[\x00-\x08\x0B\x0C\x0E-\x1F]", "", data)
    return data



def to_ascii(data):
    """Transform accentuated unicode symbols into ascii or nothing

    Warning: this solution is only suited for languages that have a direct
    transliteration to ASCII symbols.

    A better solution would be to use transliteration based on a precomputed
    unidecode map to be used by translate as explained here:

        http://stackoverflow.com/questions/2854230/

    """
    import unicodedata
    if isinstance(data, bytes):
        data = data.decode()
    nkfd_form = unicodedata.normalize('NFKD', data)
    only_ascii = nkfd_form.encode('ASCII', 'ignore')

    # Return a string
    return only_ascii.decode('ascii')



def load_text_files(corpus_path, truncate_len=None, 
                    clean=True, return_fnames=False):
    """
    ```
    load all plain text files under <corpus_path> into a list of strings
    ```
    """
    
    texts = []
    filenames = []
    mb = master_bar(range(1))
    for i in mb:
        for filename in progress_bar(list(extract_filenames(corpus_path)), parent=mb):
            with open(filename, 'r') as f:
                text = f.read()
            if clean:
                text = strip_control_characters(text)
                text = to_ascii(text)
            if truncate_len is not None:
                text = " ".join(text.split()[:truncate_len])
            texts.append(text)
            filenames.append(filename)
        mb.write('done.')
    if return_fnames:
        return (texts, filenames)
    else:
        return texts


def filter_by_id(lst, ids=[]):
    """
    ```
    filter list by supplied IDs
    ```
    """
    return [x for i,x in enumerate(lst) if i in ids]


#------------------------------------------------------------------------------
# Language-Handling
#------------------------------------------------------------------------------


def detect_lang(texts, sample_size=32):
    """
    ```
    detect the language of the given text(s) using langdetect (majority vote over a sample of documents)
    ```
    """

    # convert sentence pairs
    if isinstance(texts, (tuple, list, np.ndarray)) and len(texts) == 2:
        texts = [texts[0], texts[1]]
    elif isinstance(texts, (tuple, list, np.ndarray)) and isinstance(texts[0], (tuple, list, np.ndarray)) and len(texts[0]) == 2:
        texts = [t[0] for t in texts]

    if isinstance(texts, (pd.Series, pd.DataFrame)):
        texts = texts.values
    if isinstance(texts, str): texts = [texts]
    if not isinstance(texts, (list, np.ndarray)):
        raise ValueError('texts must be a list or NumPy array of strings')
    lst = []
    for doc in texts[:sample_size]:
        try:
            lst.append(langdetect.detect(doc))
        except:
            continue
    if len(lst) == 0: 
        warnings.warn('Defaulting to English for language detection: could not detect language from documents. '+\
                      'This may be due to empty or invalid texts being provided to detect_lang.')
        lang = 'en'
    else:
        lang = max(set(lst), key=lst.count)
    #return max(set(lst), key=lst.count)
    return lang



def is_chinese(lang, strict=True):
    """
    ```
    Args:
      lang(str): language code (e.g., en)
      strict(bool):  If False, include additional languages due to mistakes on short texts by langdetect
    ```
    """
    if strict:
        extra_clause = False
    else:
        extra_clause = lang in ['ja', 'ko']
    return (lang is not None and lang.startswith('zh-')) or extra_clause


def split_chinese(texts):
    if isinstance(texts, str): texts=[texts]

    split_texts = []
    for doc in texts:
        seg_list = jieba.cut(doc, cut_all=False)
        seg_list = list(seg_list)
        split_texts.append(seg_list)
    return [" ".join(tokens) for tokens in split_texts]


NOSPACE_LANGS = ['zh-cn', 'zh-tw', 'ja']


def is_nospace_lang(lang):
    return lang in NOSPACE_LANGS


def decode_by_line(texts, encoding='utf-8', verbose=1):
    """
    ```
    Decode text line by line and skip over errors.
    ```
    """

    if isinstance(texts, str): texts = [texts]
    new_texts = []
    skips=0
    num_lines = 0
    for doc in texts:
        text = ""
        for line in doc.splitlines():
            num_lines +=1
            try:
                line = line.decode(encoding)
            except:
                skips +=1
                continue
            text += line
        new_texts.append(text)
    pct = round((skips*1./num_lines) * 100, 1) if num_lines else 0.0
    if verbose:
        print('skipped %s lines (%s%%) due to character decoding errors' % (skips, pct))
        if pct > 10:
            print('If this is too many, try a different encoding')
    return new_texts


def detect_encoding(texts, sample_size=32):
    if not isinstance(texts, list): texts = [texts] # texts may be a single bytes object, so wrap it in a list
    lst = [chardet.detect(doc)['encoding'] for doc in texts[:sample_size]]
    encoding = max(set(lst), key=lst.count)
    # standardize to utf-8 to prevent BERT problems; also fall back to utf-8 if chardet could not detect an encoding
    encoding = 'utf-8' if encoding is None or encoding.lower() in ['ascii', 'utf8', 'utf-8'] else encoding
    return encoding


def read_text(filename):
    with open(filename, 'rb') as f:
        text = f.read()
    encoding = detect_encoding([text])
    try:
        decoded_text = text.decode(encoding)
    except:
        U.vprint('Decoding with %s failed 1st attempt - using %s with skips' % (encoding, encoding),
                 verbose=1)
        decoded_text = decode_by_line([text], encoding=encoding)[0]
    return decoded_text.strip()


#tokenizer_filter = rs='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
re_tok = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')
def tokenize(s, join_tokens=False, join_char=' '): 
    tokens = re_tok.sub(r' \1 ', s).split()
    if join_tokens: tokens = join_char.join(tokens)
    return tokens



def sent_tokenize(text, lang=None):
    """
    ```
    segment text into sentences
    ```
    """
    lang = detect_lang(text) if lang is None else lang
    sents = []
    if is_chinese(lang):
        for sent in re.findall(u'[^!?。\.\!\?]+[!?。\.\!\?]?', text, flags=re.U):
            sents.append(sent)
    else:
        for paragraph in segmenter.process(text):
            for sentence in paragraph:
                sents.append(" ".join([t.value for t in sentence]))
    return sents



def paragraph_tokenize(text, join_sentences=False, lang=None):
    """
    ```
    segment text into paragraphs
    ```
    """
    lang = detect_lang(text) if lang is None else lang
    if is_chinese(lang):
        raise ValueError('paragraph_tokenize does not currently support Chinese.')
    paragraphs = []
    sents = []
    for paragraph in segmenter.process(text):
        sents = []
        for sentence in paragraph:
            sents.append(" ".join([t.value for t in sentence]))
        if join_sentences: sents = ' '.join(sents)
        paragraphs.append(sents)
    return paragraphs


def extract_noun_phrases(text):
    """
    ```
    extracts noun phrases
    ```
    """
    try:
        from textblob import TextBlob
    except:
        raise Exception('extract_noun_phrases requires TextBlob: pip install textblob')
    blob = TextBlob(text)
    stop_words = ['which', 'what']
    curr_phrase = []
    np_list = []
    start = False
    for token in blob.tags:
        if token[1].startswith('J') or token[1].startswith('N'):
            if not start: start = True
            if token[0].lower() not in stop_words: curr_phrase.append(token[0])
        else:
            if start:
                np_list.append(" ".join(curr_phrase))
                curr_phrase = []
                start = False
    if start: np_list.append(" ".join(curr_phrase))
    return np_list


def extract_offsets(sentence, tokens=None, tokenizer=tokenize):
    """
    ```
    extracts character offsets of tokens in sentence

    Args:
      sentence (str): text
      tokens (list): list of tokens from sentence.  If None, tokens will be generated using supplied tokenizer.
      tokenizer (Callable):  a callable that accepts text and returns a list of tokens
    Return:
      list of dictionaries of the form {'token': <the token>, 'start': start character index, 'end': end character index}
    ```
    """
    tokens = tokenizer(sentence) if tokens is None else tokens
    offsets = []
    last_end = 0
    for t in tokens:
        # find start of current token
        for start_ind in range(last_end, len(sentence)):
            if sentence[start_ind] == t[0]: break
        end_ind = len(sentence)
        for end_ind in range(start_ind+1, len(sentence)):
            if (end_ind-start_ind) >= len(t): 
                break
        d = {'token': t, 
             'start' : start_ind, 'end' : end_ind,
            }
        offsets.append(d)
        last_end = end_ind
    return offsets

Functions

def decode_by_line(texts, encoding='utf-8', verbose=1)
Decode text line by line and skip over errors.
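A minimal usage sketch (the byte strings below are illustrative):

from ktrain.text.textutils import decode_by_line

raw_docs = [b'good line\n\xff\xfe bad line\nanother good line']  # one line is not valid UTF-8
decode_by_line(raw_docs, encoding='utf-8')
# -> ['good lineanother good line']; the undecodable line is skipped (decoded lines are
#    concatenated without separators) and a short skip summary is printed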
def detect_encoding(texts, sample_size=32)
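For example (per-document detection is done with chardet; very short inputs may be guessed imprecisely):

from ktrain.text.textutils import detect_encoding

docs = ['Ein längerer deutscher Beispieltext für die Erkennung.'.encode('latin-1')]
detect_encoding(docs)   # most frequent guess across the sample, e.g. 'ISO-8859-1'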
def detect_lang(texts, sample_size=32)
detect the language of the given text(s) using langdetect (majority vote over a sample of documents)
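For example:

from ktrain.text.textutils import detect_lang

detect_lang(['Ceci est un document en français.', 'Un autre exemple de texte.'])
# -> 'fr' (majority vote over up to sample_size documents)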
def extract_copy(corpus_path, output_path, verbose=0)
Crawl <corpus_path>, extract plain text from documents
and then copy them to output_path.
Requires textract package
Args:
    corpus_path(str):  root folder containing documents
    output_path(str):  root folder of output directory
    verbose(bool):  Default:0.  Set to 1 (or True) to see the error that caused each document to be skipped.
Returns:
    None: prints counts and the set of skipped MIME types/extensions when finished
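A usage sketch (the folder names are hypothetical):

from ktrain.text.textutils import extract_copy

extract_copy('/data/raw_docs', '/data/plain_text', verbose=1)
# plain-text versions of PDFs, Word documents, etc. are written under /data/plain_text,
# mirroring the folder structure of /data/raw_docs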
def extract_filenames(corpus_path, follow_links=False)
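For example (the folder name is hypothetical):

from ktrain.text.textutils import extract_filenames

for path in extract_filenames('/data/raw_docs'):
    print(path)   # paths are yielded lazily as the directory tree is walked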
def extract_noun_phrases(text)
extracts noun phrases
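For example (requires TextBlob and its NLTK corpora):

from ktrain.text.textutils import extract_noun_phrases

extract_noun_phrases('The quick brown fox jumped over the lazy dog.')
# -> something like ['quick brown fox', 'lazy dog'] (runs of adjectives/nouns joined together)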
def extract_offsets(sentence, tokens=None, tokenizer=<function tokenize>)
extracts character offsets of tokens in sentence

Args:
  sentence (str): text
  tokens (list): list of tokens from sentence.  If None, tokens will be generated using supplied tokenizer.
  tokenizer (Callable):  a callable that accepts text and returns a list of tokens
Return:
  list of dictionaries of the form {'token': <the token>, 'start': start character index, 'end': end character index}
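For example, using the default tokenizer:

from ktrain.text.textutils import extract_offsets

extract_offsets('Hello, world!')
# -> [{'token': 'Hello', 'start': 0, 'end': 5},
#     {'token': ',', 'start': 5, 'end': 6},
#     {'token': 'world', 'start': 7, 'end': 12},
#     {'token': '!', 'start': 12, 'end': 13}]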
def filter_by_id(lst, ids=[])
filter list by supplied IDs
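For example (IDs are positional indices into the list):

from ktrain.text.textutils import filter_by_id

filter_by_id(['a', 'b', 'c', 'd'], ids=[0, 2])   # -> ['a', 'c']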
def get_mimetype(filepath)
def is_chinese(lang, strict=True)
Args:
  lang(str): language code (e.g., en)
  strict(bool):  If False, include additional languages due to mistakes on short texts by langdetect
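For example:

from ktrain.text.textutils import is_chinese

is_chinese('zh-cn')              # -> True
is_chinese('ja')                 # -> False
is_chinese('ja', strict=False)   # -> True (langdetect sometimes mislabels short Chinese texts)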
def is_nospace_lang(lang)
def is_pdf(filepath)
def is_txt(filepath, strict=False)
def load_text_files(corpus_path, truncate_len=None, clean=True, return_fnames=False)
load all plain text files under <corpus_path> into a list of strings
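A usage sketch (the folder name is hypothetical):

from ktrain.text.textutils import load_text_files

texts, fnames = load_text_files('/data/plain_text',
                                truncate_len=500,    # keep at most 500 whitespace-delimited tokens per doc
                                return_fnames=True)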
def paragraph_tokenize(text, join_sentences=False, lang=None)
segment text into paragraphs
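For example (passing lang avoids a language-detection call):

from ktrain.text.textutils import paragraph_tokenize

paragraph_tokenize('First sentence. Second sentence.\n\nA new paragraph.',
                   join_sentences=True, lang='en')
# -> roughly ['First sentence . Second sentence .', 'A new paragraph .']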
def pdftotext(filename)
Use pdftotext program to convert PDF to text string.
:param filename: of PDF file
:return: text from file, or empty string if failure
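A usage sketch (requires the pdftotext utility from poppler-utils on the PATH; the filename is hypothetical):

from ktrain.text.textutils import pdftotext

text = pdftotext('/data/raw_docs/report.pdf')
if not text:
    print('no extractable text - the PDF may be scanned and need OCR')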
def read_text(filename)
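For example (the filename is hypothetical; the encoding is detected automatically):

from ktrain.text.textutils import read_text

text = read_text('/data/plain_text/report.txt')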
def requires_ocr(filename)
Uses pdffonts program to determine if the PDF requires OCR, i.e., it
doesn't contain any fonts.
:param filename: of PDF file
:return: True if requires OCR, False if not
def sent_tokenize(text, lang=None)
segment text into sentences
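For example (passing lang avoids a language-detection call):

from ktrain.text.textutils import sent_tokenize

sent_tokenize('This is one sentence. Here is another!', lang='en')
# -> roughly ['This is one sentence .', 'Here is another !']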
def split_chinese(texts)
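For example (uses jieba for word segmentation):

from ktrain.text.textutils import split_chinese

split_chinese('我爱自然语言处理')
# -> e.g. ['我 爱 自然语言 处理'] (jieba tokens joined by spaces)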
def strip_control_characters(data)
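For example:

from ktrain.text.textutils import strip_control_characters

strip_control_characters('hello\x00world\x1f!')   # -> 'helloworld!'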
def to_ascii(data)

Transform accentuated unicode symbols into ascii or nothing

Warning: this solution is only suited for languages that have a direct transliteration to ASCII symbols.

A better solution would be to use transliteration based on a precomputed unidecode map to be used by translate as explained here:

<http://stackoverflow.com/questions/2854230/>
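For example:

from ktrain.text.textutils import to_ascii

to_ascii('Ångström café')   # -> 'Angstrom cafe'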
def tokenize(s, join_tokens=False, join_char=' ')
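For example:

from ktrain.text.textutils import tokenize

tokenize("Don't panic!")                    # -> ['Don', "'", 't', 'panic', '!']
tokenize("Don't panic!", join_tokens=True)  # -> "Don ' t panic !"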