def test_dale_chall(text, expected, nlp):
    text = ftfy.fix_text(text)
    text = " ".join(text.split())
    doc = nlp(text)
    assert pytest.approx(expected, rel=1e-2) == doc._.dale_chall
def test_linsear_write(text, expected, nlp):
    text = ftfy.fix_text(text)
    text = " ".join(text.split())
    doc = nlp(text)
    assert pytest.approx(expected, rel=1e-2) == doc._.linsear_write
def test_entities():
    example = '&amp;\n<html>\n&lt;'
    assert fix_text(example) == '&\n<html>\n<'
    assert fix_text_segment(example) == '&amp;\n<html>\n&lt;'

    assert fix_text(example, fix_entities=True) == '&\n<html>\n<'
    assert fix_text_segment(example, fix_entities=True) == '&\n<html>\n<'

    assert fix_text(example, fix_entities=False) == '&amp;\n<html>\n&lt;'
    assert fix_text_segment(example, fix_entities=False) == '&amp;\n<html>\n&lt;'

    assert fix_text_segment('&lt;&gt;', fix_entities=False) == '&lt;&gt;'
    assert fix_text_segment('&lt;&gt;', fix_entities=True) == '<>'
    assert fix_text_segment('&lt;&gt;') == '<>'

    assert fix_text_segment('jednocze&sacute;nie') == 'jednocześnie'
    assert fix_text_segment('JEDNOCZE&SACUTE;NIE') == 'JEDNOCZEŚNIE'

    assert fix_text_segment('ellipsis&#133;', normalization='NFKC') == 'ellipsis...'
    assert fix_text_segment('ellipsis&#x85;', normalization='NFKC') == 'ellipsis...'
    assert fix_text_segment('broken&#x81;') == 'broken\x81'

    assert unescape_html('euro &#x80;') == 'euro €'
    assert unescape_html('not an entity &#20x6;') == 'not an entity &#20x6;'
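# The assertions above pin down ftfy's documented default: HTML entities are
# unescaped only when the text does not itself look like HTML. A minimal,
# hypothetical illustration (sample strings are made up; behaviour may differ
# slightly between ftfy versions):
import ftfy

print(ftfy.fix_text('l&eacute;gumes'))         # plain text: entity decoded -> 'légumes'
print(ftfy.fix_text('<b>l&eacute;gumes</b>'))  # looks like HTML: entities left alone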
def encode_and_clean(self, texts):
    texts_bpes, texts_clean, lens_bpes = [], [], []
    for text in texts:
        text = self.nlp(text_standardize(ftfy.fix_text(text)))
        text_tokens, text_bpe, len_bpe = [], [], []
        for token in text:
            token_text = token.text
            text_tokens.append(token_text)
            new_bpe = [self.encoder.get(t, 0) for t in self.bpe(token_text.lower()).split(' ')]
            text_bpe.extend(new_bpe)
            len_bpe.append(len(new_bpe))
        texts_clean.append(' '.join(text_tokens))  # Reassemble the cleaned, tokenized text
        texts_bpes.append(text_bpe)
        lens_bpes.append(len_bpe)
    return texts_bpes, texts_clean, lens_bpes
fields = line.split('\t')
fields = list(map(str.strip, fields))  # Strip all elements
del fields[-1]
# sys.stderr.write("fields:" + str(len(fields)) + " " + str(fields) + "\n")

cleaner = Cleaner(style=True, links=True, add_nofollow=True, page_structure=False, safe_attrs_only=False)

# read file
file = open("{inDir}/{name}".format(inDir=args.inDir, name=lineNum), "r")
b64t = file.read()
file.close()
# sys.stderr.write("b64t:" + b64t + "\n")

try:
    cleanhtml = cleaner.clean_html(re.sub(r'encoding *= *"[^"]+"', '', b64t, flags=re.IGNORECASE))
    document = html5lib.parse(ftfy.fix_text(cleanhtml), treebuilder="lxml", namespaceHTMLElements=False)
    tree = etree.tostring(document)
    cleantree = tree.decode("utf8")
    cleantree = cleantree.replace("\t", " ")
    file = open("{outDir}/{name}".format(outDir=args.outDir, name=lineNum), "w")
    file.write(cleantree)
    file.close()
except etree.ParserError as err:
    sys.stderr.write("HTML parsing error for document with URL '{1}': {0}\n".format(err, fields[0]))

lineNum += 1
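# For readability outside the file-numbering harness above, here is a minimal,
# self-contained sketch of the same cleaning pipeline applied to an in-memory
# string. The sample markup and variable values are illustrative only, not
# taken from the original script.
import re

import ftfy
import html5lib
from lxml import etree
from lxml.html.clean import Cleaner  # shipped separately as lxml_html_clean in recent lxml releases

cleaner = Cleaner(style=True, links=True, add_nofollow=True,
                  page_structure=False, safe_attrs_only=False)

raw_html = '<?xml version="1.0" encoding="utf-8"?><html><body><p>CafÃ© &amp; bar</p></body></html>'

# Drop the encoding declaration lxml refuses in unicode input, clean the markup,
# repair mojibake with ftfy, then reparse with html5lib to normalise the tree.
cleanhtml = cleaner.clean_html(re.sub(r'encoding *= *"[^"]+"', '', raw_html, flags=re.IGNORECASE))
document = html5lib.parse(ftfy.fix_text(cleanhtml), treebuilder="lxml", namespaceHTMLElements=False)
cleantree = etree.tostring(document).decode("utf8").replace("\t", " ")
print(cleantree)  # prints the cleaned, re-serialized HTML with the mojibake repaired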
import re

import ftfy


def _text_standardize(text):
    """
    Fixes some issues the spaCy tokenizer had on the books corpus.
    Also handles whitespace standardization.
    """
    text = text.replace('—', '-')
    text = text.replace('–', '-')
    text = text.replace('―', '-')
    text = text.replace('…', '...')
    text = text.replace('´', "'")
    text = re.sub(r'''(-+|~+|!+|"+|;+|\?+|\++|,+|\)+|\(+|\\+|\/+|\*+|\[+|\]+|}+|{+|\|+|_+)''', r' \1 ', text)
    text = re.sub(r'\s*\n\s*', ' \n ', text)
    text = re.sub(r'[^\S\n]+', ' ', text)
    return ftfy.fix_text(text.strip().lower())
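# A quick, hypothetical call showing what the standardization does: dashes are
# unified, the ellipsis becomes '...', the listed punctuation runs are padded
# with spaces, whitespace is collapsed, and the result is lower-cased before
# going through ftfy.fix_text.
print(_text_standardize('Hello—World… (nice)'))
# -> 'hello - world... ( nice )'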
def collect_articles(urls, end_date, filename):
    """Loops over all the URLs collected in the parent function."""
    for url in urls:
        tree = parse_html(url)
        config = page_config(tree)
        try:
            if end_date and dateParse(config["date"]) < dateParse(end_date):
                break
            else:
                out_path = os.path.dirname(os.getcwd()) + "/../data/" + filename
                with open(out_path, "a") as out_file:
                    csv_writer = csv.writer(out_file)
                    csv_writer.writerow([config["date"], ftfy.fix_text(config["title"]), url])
        except Exception:
            print("\nEXCEPTION OCCURRED\n")
text3 = unidecode(text2)
text4 = unicodedata.normalize('NFC', text2)
text5 = unidecode(text4)
print(' text:', text, '| len:', len(text))
print(' expected:', expected, ' | len:', len(expected))
print(' text == expected:', text == expected)
print('-------------------------------------')
print('text.encode("cp437").decode("utf-8"):', text2, ' | len:', len(text2), '| expected:', text2 == expected)
print(' unidecode(text2):', text3, ' | len:', len(text3), '| expected:', text3 == expected)
print('-------------------------------------')
print(' unicodedata.normalize("NFC", text2):', text4, ' | len:', len(text4), '| expected:', text4 == expected)
print(' unidecode(text4):', text5, ' | len:', len(text5), '| expected:', text5 == expected)
print('-------------------------------------')
print(' ftfy.fix_text(text):', ftfy.fix_text(text))
print('-------------------------------------')
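# The prints above probe a cp437/UTF-8 mix-up by hand; undoing this family of
# decoding errors is exactly what ftfy targets. A hypothetical round trip (the
# sample string is made up, and recovery is not guaranteed for every input):
import ftfy

garbled = 'Déjà vu'.encode('utf-8').decode('cp437')  # UTF-8 bytes wrongly decoded as cp437
print(garbled)                                       # box-drawing mojibake, roughly 'D├⌐j├á vu'
print(ftfy.fix_text(garbled))                        # ftfy attempts to recover 'Déjà vu'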
def clean_unicode(possible_string):
    if isinstance(possible_string, basestring):
        string = possible_string
        string = string.strip()
        string = string.decode('utf-8')
        string = unicode(string)
        string = ftfy.fix_text(string)
        return string
    return possible_string
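# clean_unicode above is Python 2 code (basestring, unicode, str.decode). A
# rough Python 3 equivalent, assuming inputs arrive as UTF-8 bytes or str (a
# sketch, not the original project's code):
import ftfy

def clean_unicode_py3(possible_string):
    # Accept bytes or str; anything else is passed through untouched.
    if isinstance(possible_string, bytes):
        possible_string = possible_string.decode('utf-8')
    if isinstance(possible_string, str):
        return ftfy.fix_text(possible_string.strip())
    return possible_string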
def collect_articles(urls, source, args, filename):
    """Loops over all the URLs collected in the parent function."""
    for url in urls:
        tree = parse_html(url)
        config = page_config(source, tree)
        print(url)
        if args.scrape_year and dateParse(config["date"]).year < int(args.scrape_year):
            break
        elif args.scrape_year and dateParse(config["date"]).year != int(args.scrape_year):
            pass
        else:
            with open(filename, "a") as out_file:
                csv_writer = csv.writer(out_file)
                csv_writer.writerow([config["date"], ftfy.fix_text(config["title"]), url])