Module ktrain.text.textutils
Source code
from ..imports import *
from subprocess import Popen, PIPE, DEVNULL
DEFAULT_TOKEN_PATTERN = (r"\b[a-zA-Z][a-zA-Z0-9]*(?:[_/&-][a-zA-Z0-9]+)+\b|"
r"\b\d*[a-zA-Z][a-zA-Z0-9][a-zA-Z0-9]+\b")
def extract_copy(corpus_path, output_path, verbose=0):
"""
```
Crawl <corpus_path>, extract plain text from documents
and then copy them to output_path.
Requires textract package
Args:
corpus_path(str): root folder containing documents
output_path(str): root folder of output directory
        verbose(bool): Default: 0. Set to 1 (or True) to print the reason each skipped document was skipped.
    Returns:
        None. Prints counts of processed and skipped documents and the set of skipped MIME types/extensions.
```
"""
try:
# TODO: change this to use TextExtractor
import textract
except ImportError:
raise Exception('extract_copy requires textract: pip install textract')
skipped = set()
num_skipped = 0
corpus_path = os.path.normpath(corpus_path)
output_path = os.path.normpath(output_path)
for idx, filename in enumerate(extract_filenames(corpus_path)):
if idx %1000 == 0: print('processed %s doc(s)' % (idx+1))
mtype = get_mimetype(filename)
try:
if mtype and mtype.split('/')[0] == 'text':
with open(filename, 'r') as f:
text = f.read()
text = str.encode(text)
else:
text = textract.process(filename)
except Exception as e:
if verbose:
print('ERROR on %s:\n%s' % (filename, e))
num_skipped += 1
if not mtype:
mtype = os.path.splitext(filename)[1]
            if not mtype: mtype = 'unknown'
skipped.add(mtype)
continue
if not text:
num_skipped += 1
continue
fpath, fname = os.path.split(filename)
if mtype and mtype.split('/')[0] != 'text': fname = fname+'.txt'
relfpath = fpath.replace(corpus_path, '')
relfpath = relfpath[1:] if relfpath and relfpath[0] == os.sep else relfpath
opath = os.path.join(output_path, relfpath)
if not os.path.exists(opath):
os.makedirs(opath)
ofilename = os.path.join(opath, fname)
with open(ofilename, 'wb') as f:
f.write(text)
print('processed %s docs' % (idx+1))
print('done.')
print('skipped %s docs' % (num_skipped))
if skipped: print('%s' %(skipped))
def get_mimetype(filepath):
return mimetypes.guess_type(filepath)[0]
def is_txt(filepath, strict=False):
if strict:
return mimetypes.guess_type(filepath)[0] == 'text/plain'
else:
mtype = get_mimetype(filepath)
return mtype is not None and mtype.split('/')[0] == 'text'
def is_pdf(filepath):
return mimetypes.guess_type(filepath)[0] == 'application/pdf'
def pdftotext(filename):
"""
```
Use pdftotext program to convert PDF to text string.
:param filename: of PDF file
:return: text from file, or empty string if failure
```
"""
output = Popen(['pdftotext', '-q', filename, '-'],
stdout=PIPE).communicate()[0]
# None may indicate damage, but convert for consistency
return '' if output is None else output
def requires_ocr(filename):
"""
```
Uses pdffonts program to determine if the PDF requires OCR, i.e., it
doesn't contain any fonts.
:param filename: of PDF file
:return: True if requires OCR, False if not
```
"""
output = Popen(['pdffonts', filename], stdout=PIPE,
stderr=DEVNULL).communicate()[0]
    return len(output.split(b'\n')) < 4  # output from Popen is bytes
def extract_filenames(corpus_path, follow_links=False):
if os.listdir(corpus_path) == []:
raise ValueError("%s: path is empty" % corpus_path)
walk = os.walk
for root, dirs, filenames in walk(corpus_path, followlinks=follow_links):
for filename in filenames:
try:
yield os.path.join(root, filename)
except:
continue
def strip_control_characters(data):
if data:
# unicode invalid characters
re_xml_illegal = (
'([\u0000-\u0008\u000b-\u000c\u000e-\u001f\ufffe-\uffff])|'
'([%s-%s][^%s-%s])|([^%s-%s][%s-%s])|([%s-%s]$)|(^[%s-%s])'
% (chr(0xd800), chr(0xdbff), chr(0xdc00), chr(0xdfff), chr(0xd800),
chr(0xdbff), chr(0xdc00), chr(0xdfff), chr(0xd800), chr(0xdbff),
chr(0xdc00), chr(0xdfff))
)
data = re.sub(re_xml_illegal, "", data)
# ascii control characters
#data = re.sub(r"[\x01-\x1F\x7F]", "", data)
# See: http://w3.org/International/questions/qa-forms-utf-8.html
# Printable utf-8 does not include any of these chars below x7F
data = re.sub(r"[\x00-\x08\x0B\x0C\x0E-\x1F]", "", data)
return data
def to_ascii(data):
"""Transform accentuated unicode symbols into ascii or nothing
Warning: this solution is only suited for languages that have a direct
transliteration to ASCII symbols.
A better solution would be to use transliteration based on a precomputed
unidecode map to be used by translate as explained here:
http://stackoverflow.com/questions/2854230/
"""
import unicodedata
if isinstance(data, bytes):
data = data.decode()
nkfd_form = unicodedata.normalize('NFKD', data)
only_ascii = nkfd_form.encode('ASCII', 'ignore')
# Return a string
return only_ascii.decode('ascii')
def load_text_files(corpus_path, truncate_len=None,
clean=True, return_fnames=False):
"""
```
load text files
```
"""
texts = []
filenames = []
mb = master_bar(range(1))
for i in mb:
for filename in progress_bar(list(extract_filenames(corpus_path)), parent=mb):
with open(filename, 'r') as f:
text = f.read()
if clean:
text = strip_control_characters(text)
text = to_ascii(text)
if truncate_len is not None:
text = " ".join(text.split()[:truncate_len])
texts.append(text)
filenames.append(filename)
mb.write('done.')
if return_fnames:
return (texts, filenames)
else:
return texts
def filter_by_id(lst, ids=[]):
"""
```
filter list by supplied IDs
```
"""
return [x for i,x in enumerate(lst) if i in ids]
#------------------------------------------------------------------------------
# Language-Handling
#------------------------------------------------------------------------------
def chinese_stopwords():
with open(os.path.join(os.path.dirname(os.path.abspath(__file__)), 'stopwords-zh.txt'), 'r') as f:
return [line.strip() for line in f]
def detect_lang(texts, sample_size=32):
"""
```
detect language
```
"""
# convert sentence pairs
if isinstance(texts, (tuple, list, np.ndarray)) and len(texts) == 2:
texts = [texts[0], texts[1]]
elif isinstance(texts, (tuple, list, np.ndarray)) and isinstance(texts[0], (tuple, list, np.ndarray)) and len(texts[0]) == 2:
texts = [t[0] for t in texts]
if isinstance(texts, (pd.Series, pd.DataFrame)):
texts = texts.values
if isinstance(texts, str): texts = [texts]
if not isinstance(texts, (list, np.ndarray)):
raise ValueError('texts must be a list or NumPy array of strings')
lst = []
for doc in texts[:sample_size]:
try:
lst.append(langdetect.detect(doc))
except:
continue
if len(lst) == 0:
warnings.warn('Defaulting to English for language detection: could not detect language from documents. '+\
'This may be due to empty or invalid texts being provided to detect_lang.')
lang = 'en'
else:
lang = max(set(lst), key=lst.count)
#return max(set(lst), key=lst.count)
return lang
def is_chinese(lang, strict=True):
"""
```
Args:
lang(str): language code (e.g., en)
strict(bool): If False, include additional languages due to mistakes on short texts by langdetect
```
"""
if strict:
extra_clause = False
else:
extra_clause = lang in ['ja', 'ko']
return lang is not None and lang.startswith('zh-') or extra_clause
def split_chinese(texts):
if isinstance(texts, str): texts=[texts]
split_texts = []
for doc in texts:
seg_list = jieba.cut(doc, cut_all=False)
seg_list = list(seg_list)
split_texts.append(seg_list)
return [" ".join(tokens) for tokens in split_texts]
NOSPACE_LANGS = ['zh-cn', 'zh-tw', 'ja']
def is_nospace_lang(lang):
return lang in NOSPACE_LANGS
def decode_by_line(texts, encoding='utf-8', verbose=1):
"""
```
Decode text line by line and skip over errors.
```
"""
if isinstance(texts, str): texts = [texts]
new_texts = []
skips=0
num_lines = 0
for doc in texts:
text = ""
for line in doc.splitlines():
num_lines +=1
try:
line = line.decode(encoding)
except:
skips +=1
continue
text += line
new_texts.append(text)
pct = round((skips*1./num_lines) * 100, 1)
if verbose:
print('skipped %s lines (%s%%) due to character decoding errors' % (skips, pct))
if pct > 10:
print('If this is too many, try a different encoding')
return new_texts
def detect_encoding(texts, sample_size=32):
if not isinstance(texts, list): texts = [texts] # check for instance of list as bytes are supplied as input
lst = [chardet.detect(doc)['encoding'] for doc in texts[:sample_size]]
encoding = max(set(lst), key=lst.count)
# standardize to utf-8 to prevent BERT problems
encoding = 'utf-8' if encoding.lower() in ['ascii', 'utf8', 'utf-8'] else encoding
return encoding
def read_text(filename):
with open(filename, 'rb') as f:
text = f.read()
encoding = detect_encoding([text])
try:
decoded_text = text.decode(encoding)
except:
        U.vprint('Decoding with %s failed 1st attempt - using %s with skips' % (encoding, encoding),
                 verbose=1)  # NOTE: verbose was previously undefined here; default to printing the warning
        decoded_text = decode_by_line([text], encoding=encoding)[0]
return decoded_text.strip()
#tokenizer_filter = rs='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
re_tok = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')
def tokenize(s, join_tokens=False, join_char=' '):
tokens = re_tok.sub(r' \1 ', s).split()
if join_tokens: tokens = join_char.join(tokens)
return tokens
def sent_tokenize(text, lang=None):
"""
```
segment text into sentences
```
"""
lang = detect_lang(text) if lang is None else lang
sents = []
if is_chinese(lang):
for sent in re.findall(u'[^!?。\.\!\?]+[!?。\.\!\?]?', text, flags=re.U):
sents.append(sent)
else:
for paragraph in segmenter.process(text):
for sentence in paragraph:
sents.append(" ".join([t.value for t in sentence]))
return sents
def paragraph_tokenize(text, join_sentences=False, lang=None):
"""
```
segment text into paragraphs
```
"""
lang = detect_lang(text) if lang is None else lang
if is_chinese(lang):
raise ValueError('paragraph_tokenize does not currently support Chinese.')
paragraphs = []
sents = []
for paragraph in segmenter.process(text):
sents = []
for sentence in paragraph:
sents.append(" ".join([t.value for t in sentence]))
if join_sentences: sents = ' '.join(sents)
paragraphs.append(sents)
return paragraphs
def extract_noun_phrases(text):
"""
```
extracts noun phrases
```
"""
try:
from textblob import TextBlob
    except ImportError:
        raise Exception('extract_noun_phrases requires TextBlob: pip install textblob')
blob = TextBlob(text)
stop_words = ['which', 'what']
curr_phrase = []
np_list = []
start = False
for token in blob.tags:
if token[1].startswith('J') or token[1].startswith('N'):
if not start: start = True
if token[0].lower() not in stop_words: curr_phrase.append(token[0])
else:
if start:
np_list.append(" ".join(curr_phrase))
curr_phrase = []
start = False
if start: np_list.append(" ".join(curr_phrase))
return np_list
def extract_offsets(sentence, tokens=None, tokenizer=tokenize):
"""
```
    extracts character offsets for each token in sentence
Args:
sentence (str): text
tokens (list): list of tokens from sentence. If None, tokens will be generated using supplied tokenizer.
tokenizer (Callable): a callable that accepts text and returns a list of tokens
Return:
list of dictionaries of the form {'token': <the token>, 'start': start character index, 'end': end character index}
```
"""
s = sentence
    if tokens is None:
        tokens = tokenizer(sentence)
offsets = []
last_end = 0
for t in tokens:
# find start of current token
for start_ind in range(last_end, len(sentence)):
if sentence[start_ind] == t[0]: break
end_ind = len(sentence)
for end_ind in range(start_ind+1, len(sentence)):
if (end_ind-start_ind) >= len(t):
break
d = {'token': t,
'start' : start_ind, 'end' : end_ind,
}
offsets.append(d)
last_end = end_ind
return offsets
Functions
def chinese_stopwords()
def decode_by_line(texts, encoding='utf-8', verbose=1)
Decode text line by line and skip over errors.
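A minimal usage sketch; the byte strings below are made up for illustration:
from ktrain.text.textutils import decode_by_line
raw_docs = [b'good line\nbad byte \xff here\nanother good line']
clean_docs = decode_by_line(raw_docs, encoding='utf-8', verbose=1)
# lines that fail to decode are skipped; surviving lines are concatenated per document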
def detect_encoding(texts, sample_size=32)
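A usage sketch, assuming a hypothetical file on disk whose encoding is unknown:
from ktrain.text.textutils import detect_encoding
with open('some_document.txt', 'rb') as f:   # hypothetical file
    raw = f.read()
enc = detect_encoding([raw])                 # e.g., 'utf-8' or 'windows-1252'
text = raw.decode(enc, errors='replace')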
def detect_lang(texts, sample_size=32)
detect language
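A usage sketch; langdetect's per-document predictions can vary on very short strings, so the majority vote over the sample is returned:
from ktrain.text.textutils import detect_lang
docs = ['The quick brown fox jumps over the lazy dog.',
        'Pack my box with five dozen liquor jugs.',
        'How vexingly quick daft zebras jump!']
print(detect_lang(docs))   # expected: 'en'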
def extract_copy(corpus_path, output_path, verbose=0)
Crawl <corpus_path>, extract plain text from each document, and write the results under <output_path>. Requires the textract package.
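A usage sketch with hypothetical folder names; textract must be installed (pip install textract):
from ktrain.text.textutils import extract_copy
# walks my_corpus/, extracts plain text from PDFs, Word files, etc.,
# and mirrors the folder structure under my_corpus_txt/ as .txt files
extract_copy('my_corpus', 'my_corpus_txt', verbose=1)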
def extract_filenames(corpus_path, follow_links=False)
def extract_noun_phrases(text)
extracts noun phrases
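A usage sketch; requires TextBlob and its corpora (pip install textblob; python -m textblob.download_corpora):
from ktrain.text.textutils import extract_noun_phrases
phrases = extract_noun_phrases('The deep learning model produced state of the art results.')
# adjacent adjective/noun tokens are grouped into phrases,
# e.g., something like ['deep learning model', 'state', 'art results'] depending on the POS tagger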
def extract_offsets(sentence, tokens=None, tokenizer=<function tokenize>)
Extracts character offsets for each token in sentence. Returns a list of dicts of the form {'token': ..., 'start': ..., 'end': ...}.
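A usage sketch showing how each token is mapped back to a character span in the original string:
from ktrain.text.textutils import extract_offsets
offsets = extract_offsets('Hello, world!')
# the first entry is {'token': 'Hello', 'start': 0, 'end': 5}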
def filter_by_id(lst, ids=[])
filter list by supplied IDs
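A usage sketch; list positions not listed in ids are dropped:
from ktrain.text.textutils import filter_by_id
filter_by_id(['a', 'b', 'c', 'd'], ids=[0, 2])   # -> ['a', 'c']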
def get_mimetype(filepath)
def is_chinese(lang, strict=True)
Returns True if the language code refers to Chinese (e.g., 'zh-cn'). With strict=False, 'ja' and 'ko' are also treated as Chinese to compensate for langdetect mistakes on short texts.
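A usage sketch:
from ktrain.text.textutils import is_chinese
is_chinese('zh-cn')                # True
is_chinese('ja')                   # False
is_chinese('ja', strict=False)     # True (langdetect sometimes mislabels short Chinese texts as 'ja' or 'ko')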
def is_nospace_lang(lang)
def is_pdf(filepath)
def is_txt(filepath, strict=False)
def load_text_files(corpus_path, truncate_len=None, clean=True, return_fnames=False)
load text files
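A usage sketch with a hypothetical folder of plain-text files:
from ktrain.text.textutils import load_text_files
texts, fnames = load_text_files('my_corpus_txt', truncate_len=512,
                                clean=True, return_fnames=True)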
def paragraph_tokenize(text, join_sentences=False, lang=None)
segment text into paragraphs
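A usage sketch; paragraphs are split on blank lines by the underlying segmenter:
from ktrain.text.textutils import paragraph_tokenize
document_text = 'First sentence. Second sentence.\n\nA new paragraph here.'
paras = paragraph_tokenize(document_text, join_sentences=True, lang='en')
# one string per paragraph when join_sentences=True; otherwise a list of sentence strings per paragraph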
def pdftotext(filename)
Uses the pdftotext program to convert a PDF file to text. Returns the extracted text, or an empty string on failure.
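A usage sketch; the PDF name is hypothetical and the poppler-utils pdftotext binary must be on the PATH:
from ktrain.text.textutils import pdftotext
raw = pdftotext('report.pdf')   # raw bytes from the pdftotext subprocess, or '' on failure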
def read_text(filename)
def requires_ocr(filename)
Uses the pdffonts program to determine whether a PDF requires OCR (i.e., it contains no embedded fonts). Returns True if OCR is required.
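A usage sketch; the file name is hypothetical and the poppler-utils pdffonts binary must be installed:
from ktrain.text.textutils import requires_ocr
if requires_ocr('scanned.pdf'):
    print('No embedded fonts found - run OCR before extracting text.')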
def sent_tokenize(text, lang=None)
segment text into sentences
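A usage sketch; note that sentences are rebuilt by joining tokens with single spaces, so punctuation is space-separated in the output:
from ktrain.text.textutils import sent_tokenize
sents = sent_tokenize('Dr. Smith arrived. She began the lecture.', lang='en')
# two sentence strings are returned, with tokens joined by single spaces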
def split_chinese(texts)
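A usage sketch; the exact segmentation depends on jieba's dictionary:
from ktrain.text.textutils import split_chinese
split_chinese('我爱自然语言处理')   # e.g., ['我 爱 自然 语言 处理']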
def strip_control_characters(data)
def to_ascii(data)
Transform accentuated unicode symbols into ascii or nothing
Warning: this solution is only suited for languages that have a direct transliteration to ASCII symbols.
A better solution would be to use transliteration based on a precomputed unidecode map to be used by translate as explained here:
<http://stackoverflow.com/questions/2854230/>
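A usage sketch:
from ktrain.text.textutils import to_ascii
to_ascii('café déjà vu')   # -> 'cafe deja vu'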
def tokenize(s, join_tokens=False, join_char=' ')
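A usage sketch; punctuation is split into separate tokens:
from ktrain.text.textutils import tokenize
tokenize('Hello, world!')                     # -> ['Hello', ',', 'world', '!']
tokenize('Hello, world!', join_tokens=True)   # -> 'Hello , world !'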