Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
# Keep track of the number of quotation marks, counted separately per quote symbol.
quote_counts = {u"'": 0, u'"': 0, u"``": 0, u"`": 0, u"''": 0}
# The *prepend_space* variable is used to control the "effects" of
# detokenization as the function loops through the list of tokens and
# changes the *prepend_space* accordingly as it sequentially checks
# through the language specific and language independent conditions.
prepend_space = " "
detokenized_text = ""
tokens = text.split()
# Iterate through every token and apply language specific detokenization rule(s).
for i, token in enumerate(iter(tokens)):
# Check if the first char is CJK (this rule is not applied when the language is Korean).
if is_cjk(token[0]) and self.lang != "ko":
# Perform left shift if this is a second consecutive CJK word.
if i > 0 and is_cjk(tokens[i - 1][-1]):
detokenized_text += token
# But do nothing special if this is a CJK word that doesn't follow a CJK word
else:
detokenized_text += prepend_space + token
prepend_space = " "
# If the token consists solely of currency symbols (self.IsSc), opening brackets, or inverted punctuation (¿, ¡).
elif re.search(u"^[" + self.IsSc + u"\(\[\{\¿\¡]+$", token):
# Perform right shift on currency symbols and opening punctuation: emit the token, then clear *prepend_space* so that the next token can attach directly to it.
detokenized_text += prepend_space + token
prepend_space = ""
elif re.search(r"^[\,\.\?\!\:\;\\\%\}\]\)]+$", token):
# In French, these punctuation marks are preceded by a non-breaking space.
if self.lang == "fr" and re.search(r"^[\?\!\:\;\\\%]$", token):
detokenized_text += " "
# Perform left shift on punctuation items.