def test_surrogates():
    assert fix_surrogates('\udbff\udfff') == '\U0010ffff'
    assert fix_surrogates('\ud800\udc00') == '\U00010000'
This function reads just the text (the part after the tab, if there is a tab). It
removes URLs and Twitter handles from the text. It then language-detects the
text, and if it is confident about the language, it outputs a new tab-separated
file containing the language code and the processed text.
This format could be read again by the same function, because the language code
is now the metadata, but we have no reason to actually do this.
"""
for line in infile:
    if "\t" in line:
        line = line.split("\t", 1)[1]
    text = line.rstrip()
    text = TWITTER_HANDLE_RE.sub("", text)
    text = TCO_RE.sub("", text)
    text = fix_surrogates(unescape_html(text)).replace("\n", " ")
    lang, confident = detect_language(text)
    if confident:
        print(f"{lang}\t{text}", file=outfile)
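The preprocessing above relies on two module-level regexes, TWITTER_HANDLE_RE and TCO_RE, that are not shown in the snippet. A minimal sketch of plausible definitions follows; the patterns are hypothetical approximations, and the ones in the original module may be stricter.

import re

# Hypothetical approximations of the patterns used above
TWITTER_HANDLE_RE = re.compile(r"@[A-Za-z0-9_]+")   # @username mentions
TCO_RE = re.compile(r"https?://t\.co/\S+")          # t.co shortened URLs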
if fix_entities:
    text = fixes.unescape_html(text)
if remove_terminal_escapes:
    text = fixes.remove_terminal_escapes(text)
if fix_encoding:
    text = fixes.fix_text_encoding(text)
if fix_latin_ligatures:
    text = fixes.fix_latin_ligatures(text)
if fix_character_width:
    text = fixes.fix_character_width(text)
if uncurl_quotes:
    text = fixes.uncurl_quotes(text)
if fix_line_breaks:
    text = fixes.fix_line_breaks(text)
if fix_surrogates:
    text = fixes.fix_surrogates(text)
if remove_control_chars:
    text = fixes.remove_control_chars(text)
if remove_bom:
    text = fixes.remove_bom(text)
if normalization is not None:
    text = unicodedata.normalize(normalization, text)
if text == origtext:
    return text
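The trailing `if text == origtext: return text` only makes sense inside a loop: each pass applies every enabled fix, and the function returns once a pass no longer changes the text. A minimal sketch of that surrounding structure, paraphrased from ftfy's fix_text_segment (details may differ between versions):

while True:
    origtext = text
    # ... apply each enabled fix in turn, as in the block above ...
    if text == origtext:
        return text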
if remove_terminal_escapes:
    text = fixes.remove_terminal_escapes(text)
if fix_encoding:
    text = fixes.fix_encoding(text)
if fix_entities:
    text = fixes.unescape_html(text)
if fix_latin_ligatures:
    text = fixes.fix_latin_ligatures(text)
if fix_character_width:
    text = fixes.fix_character_width(text)
if uncurl_quotes:
    text = fixes.uncurl_quotes(text)
if fix_line_breaks:
    text = fixes.fix_line_breaks(text)
if fix_surrogates:
    text = fixes.fix_surrogates(text)
if remove_control_chars:
    text = fixes.remove_control_chars(text)
if remove_bom and not remove_control_chars:
    # Skip this step if we've already done `remove_control_chars`,
    # because it would be redundant.
    text = fixes.remove_bom(text)
if normalization is not None:
    text = unicodedata.normalize(normalization, text)
if text == origtext:
    return text
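In the pre-6.0 ftfy releases these snippets appear to come from, the same per-fix flags are exposed as keyword arguments on ftfy.fix_text, so individual steps can be switched off at the call site. A small usage sketch under that assumption:

import ftfy

# "âœ”" is the UTF-8 encoding of "✔" misread as Latin-1/Windows-1252;
# fix_encoding (on by default) repairs it, while uncurl_quotes=False
# leaves any curly quotes in place.
print(ftfy.fix_text("âœ” No problems", uncurl_quotes=False))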
def tokenize_file(
    infile, outfile, language, check_language=False, punctuation=False, ftfy=False
):
    """
    Take in a file of plain text, tokenize it as the given language, and write
    the result as lines of space-separated tokens.
    """
    for line in infile:
        if ftfy:
            # Run all ftfy fixes, but don't let it introduce line breaks
            line = fix_text(line.rstrip()).replace('\n', ' ')
        else:
            # Run only specific quick fixes from ftfy
            line = fix_surrogates(unescape_html(line.rstrip()))
        tokens = tokenize(
            line, language, include_punctuation=punctuation, external_wordlist=True
        )
        checked_lang = None
        if check_language:
            checked_lang, _confident = detect_language(line.rstrip())
        if (not check_language) or langcodes.tag_match_score(
            checked_lang, language
        ) >= 90:
            print(' '.join(tokens), file=outfile)
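As written, tokenize_file depends on several names defined elsewhere. Assuming it comes from a project built on wordfreq, ftfy, and langcodes, and that detect_language is a project-local helper rather than a library call, the imports would look roughly like:

import langcodes
from ftfy import fix_text
from ftfy.fixes import fix_surrogates, unescape_html
from wordfreq import tokenize  # supports include_punctuation / external_wordlist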