# Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def __further_tokenize(self, text):
    """\
    Insert spaces between adjacent characters of different Unicode
    classes (letters, numbers, punctuation, symbols).

    The text arrives with Moses special characters escaped as entities
    (&pipe; &lt; &gt; &amp;); these are unescaped first so the
    class-splitting regexes see the real characters, then re-escaped
    before returning.

    @param text: the (Moses-escaped) text to tokenize further
    @return: the text with spaces inserted between character classes
    """
    # unescape Moses entities (the original code had the entity names
    # corrupted to their literal characters, making every substitution
    # here an identity no-op)
    text = Regex(r"&pipe;").sub(r"|", text)
    text = Regex(r"&lt;").sub(r"<", text)
    text = Regex(r"&gt;").sub(r">", text)
    text = Regex(r"&amp;").sub(r"&", text)
    # split: L=letter, N=number, P=punctuation, S=symbol; both orders
    # of each class pair are covered
    text = Regex(r"([\p{N}\p{P}\p{S}])([\p{L}])").sub(r"\1 \2", text)
    text = Regex(r"([\p{L}\p{P}\p{S}])([\p{N}])").sub(r"\1 \2", text)
    text = Regex(r"([\p{L}\p{N}\p{S}])([\p{P}])").sub(r"\1 \2", text)
    text = Regex(r"([\p{L}\p{N}\p{P}])([\p{S}])").sub(r"\1 \2", text)
    text = Regex(r"([\p{L}])([\p{N}\p{P}\p{S}])").sub(r"\1 \2", text)
    text = Regex(r"([\p{N}])([\p{L}\p{P}\p{S}])").sub(r"\1 \2", text)
    text = Regex(r"([\p{P}])([\p{L}\p{N}\p{S}])").sub(r"\1 \2", text)
    text = Regex(r"([\p{S}])([\p{L}\p{N}\p{P}])").sub(r"\1 \2", text)
    # re-escape: ampersand first, so the '&' introduced by the entities
    # below is not double-escaped
    text = Regex(r"&").sub(r"&amp;", text)
    text = Regex(r"\|").sub(r"&pipe;", text)
    text = Regex(r"<").sub(r"&lt;", text)
    text = Regex(r">").sub(r"&gt;", text)
    return text
def __init__(self, options=None):
    """\
    Constructor (pre-compile all needed regexes).

    @param options: dict of options; recognized keys: 'language',
        'nobreak_file', 'moses_escape'
    """
    # avoid a shared mutable default argument
    options = options if options is not None else {}
    # process options
    self.moses_escape = True if options.get('moses_escape') else False
    # load no-break prefixes for the given language
    # (the original called this twice; once is enough)
    self.__load_nobreaks(options.get('language'),
                         options.get('nobreak_file'))
    # compile regexes -- each exactly once; the original compiled several
    # of them two or three times, discarding the earlier results.  Where
    # duplicates disagreed (e.g. __spaces with/without UNICODE), the
    # LAST assignment is kept to preserve the original net behavior.
    self.__spaces = Regex(r'\s+')
    self.__space_at_end = Regex(r'(^|\n) ')
    self.__space_at_begin = Regex(r' ($|\n)')
    self.__non_period = Regex(r'([?!]|\.{2,}) +' + self.SENT_STARTER)
    self.__in_punct = Regex(r'([?!\.] *' + self.FINAL_PUNCT + r') +' +
                            self.SENT_STARTER)
    self.__punct_follows = Regex(r'([?!\.]) +' + self.SENT_STARTER_PUNCT)
    self.__period = Regex(r'([\p{Alnum}\.\-]+)(' + self.FINAL_PUNCT +
                          r')? *$')
    self.__ucase_acronym = Regex(r'\.[\p{Upper}\-]+$')
    self.__numbers = Regex(r'^\p{N}')
    self.__sent_starter = Regex(self.SENT_STARTER)
    self.__ascii_junk = Regex(r'[\000-\037]')
    self.__special_chars = \
        Regex(r'(([^\p{IsAlnum}\s\.\,−\-])\2*)')
    # single quotes: all unicode quotes + prime
    self.__to_single_quotes = Regex(r'[`‛‚‘’‹›′]')
    # double quotes: all unicode chars incl. Chinese + double prime + ditto
    self.__to_double_quotes = Regex(r'(\'\'|``|[«»„‟“”″〃「」『』〝〞〟])')
    self.__no_numbers = Regex(r'([^\p{N}])([,.])([^\p{N}])')
    self.__pre_numbers = Regex(r'([^\p{N}])([,.])([\p{N}])')
    self.__post_numbers = Regex(r'([\p{N}])([,.])([^\p{N}])')
    # hyphen: separate every time but for unary minus
    self.__minus = Regex(r'([-−])')
    self.__pre_notnum = Regex(r'(-)([^\p{N}])')
    self.__post_num_or_nospace = Regex(r'(\p{N} *|[^ ])(-)')
def __further_tokenize(self, text):
    """\
    Insert spaces between adjacent characters of different Unicode
    classes (letters, numbers, punctuation, symbols), honoring
    Moses-style entity escapes (&pipe; &lt; &gt; &amp;).

    @param text: the (Moses-escaped) text to tokenize further
    @return: the text with spaces inserted between character classes
    """
    # unescape Moses entities (the original code had the entity names
    # corrupted to their literal characters, making the escape
    # round-trip a no-op)
    text = Regex(r"&pipe;").sub(r"|", text)
    text = Regex(r"&lt;").sub(r"<", text)
    text = Regex(r"&gt;").sub(r">", text)
    text = Regex(r"&amp;").sub(r"&", text)
    # split runs of different classes (both orders of each pair)
    text = Regex(r"([\p{N}\p{P}\p{S}])([\p{L}])").sub(r"\1 \2", text)
    text = Regex(r"([\p{L}\p{P}\p{S}])([\p{N}])").sub(r"\1 \2", text)
    text = Regex(r"([\p{L}\p{N}\p{S}])([\p{P}])").sub(r"\1 \2", text)
    text = Regex(r"([\p{L}\p{N}\p{P}])([\p{S}])").sub(r"\1 \2", text)
    text = Regex(r"([\p{L}])([\p{N}\p{P}\p{S}])").sub(r"\1 \2", text)
    text = Regex(r"([\p{N}])([\p{L}\p{P}\p{S}])").sub(r"\1 \2", text)
    text = Regex(r"([\p{P}])([\p{L}\p{N}\p{S}])").sub(r"\1 \2", text)
    text = Regex(r"([\p{S}])([\p{L}\p{N}\p{P}])").sub(r"\1 \2", text)
    # re-escape: ampersand first, so entity ampersands stay intact
    text = Regex(r"&").sub(r"&amp;", text)
    text = Regex(r"\|").sub(r"&pipe;", text)
    text = Regex(r"<").sub(r"&lt;", text)
    text = Regex(r">").sub(r"&gt;", text)
    return text
def __further_tokenize(self, text):
    """\
    Insert spaces between adjacent characters of different Unicode
    classes (letters, numbers, punctuation, symbols), honoring
    Moses-style entity escapes (&pipe; &lt; &gt; &amp;).

    @param text: the (Moses-escaped) text to tokenize further
    @return: the text with spaces inserted between character classes
    """
    # unescape Moses entities (the original code had the entity names
    # corrupted to their literal characters, making the escape
    # round-trip a no-op)
    text = Regex(r"&pipe;").sub(r"|", text)
    text = Regex(r"&lt;").sub(r"<", text)
    text = Regex(r"&gt;").sub(r">", text)
    text = Regex(r"&amp;").sub(r"&", text)
    # split runs of different classes (both orders of each pair)
    text = Regex(r"([\p{N}\p{P}\p{S}])([\p{L}])").sub(r"\1 \2", text)
    text = Regex(r"([\p{L}\p{P}\p{S}])([\p{N}])").sub(r"\1 \2", text)
    text = Regex(r"([\p{L}\p{N}\p{S}])([\p{P}])").sub(r"\1 \2", text)
    text = Regex(r"([\p{L}\p{N}\p{P}])([\p{S}])").sub(r"\1 \2", text)
    text = Regex(r"([\p{L}])([\p{N}\p{P}\p{S}])").sub(r"\1 \2", text)
    text = Regex(r"([\p{N}])([\p{L}\p{P}\p{S}])").sub(r"\1 \2", text)
    text = Regex(r"([\p{P}])([\p{L}\p{N}\p{S}])").sub(r"\1 \2", text)
    text = Regex(r"([\p{S}])([\p{L}\p{N}\p{P}])").sub(r"\1 \2", text)
    # re-escape: ampersand first, so entity ampersands stay intact
    text = Regex(r"&").sub(r"&amp;", text)
    text = Regex(r"\|").sub(r"&pipe;", text)
    text = Regex(r"<").sub(r"&lt;", text)
    text = Regex(r">").sub(r"&gt;", text)
    return text
def __further_tokenize(self, text):
    """\
    Insert spaces between adjacent characters of different Unicode
    classes (letters, numbers, punctuation, symbols), honoring
    Moses-style entity escapes (&pipe; &lt; &gt; &amp;).

    @param text: the (Moses-escaped) text to tokenize further
    @return: the text with spaces inserted between character classes
    """
    # NOTE(review): this copy of the method was truncated -- the `def`
    # header and the unescape prologue were missing, leaving a bare
    # module-level `return` (a SyntaxError).  Reconstructed from the
    # complete copies elsewhere in this file.
    # unescape Moses entities
    text = Regex(r"&pipe;").sub(r"|", text)
    text = Regex(r"&lt;").sub(r"<", text)
    text = Regex(r"&gt;").sub(r">", text)
    text = Regex(r"&amp;").sub(r"&", text)
    # split runs of different classes (both orders of each pair)
    text = Regex(r"([\p{N}\p{P}\p{S}])([\p{L}])").sub(r"\1 \2", text)
    text = Regex(r"([\p{L}\p{P}\p{S}])([\p{N}])").sub(r"\1 \2", text)
    text = Regex(r"([\p{L}\p{N}\p{S}])([\p{P}])").sub(r"\1 \2", text)
    text = Regex(r"([\p{L}\p{N}\p{P}])([\p{S}])").sub(r"\1 \2", text)
    text = Regex(r"([\p{L}])([\p{N}\p{P}\p{S}])").sub(r"\1 \2", text)
    text = Regex(r"([\p{N}])([\p{L}\p{P}\p{S}])").sub(r"\1 \2", text)
    text = Regex(r"([\p{P}])([\p{L}\p{N}\p{S}])").sub(r"\1 \2", text)
    text = Regex(r"([\p{S}])([\p{L}\p{N}\p{P}])").sub(r"\1 \2", text)
    # re-escape: ampersand first, so entity ampersands stay intact
    text = Regex(r"&").sub(r"&amp;", text)
    text = Regex(r"\|").sub(r"&pipe;", text)
    text = Regex(r"<").sub(r"&lt;", text)
    text = Regex(r">").sub(r"&gt;", text)
    return text
def __further_tokenize(self, text):
    """\
    Insert spaces between adjacent characters of different Unicode
    classes (letters, numbers, punctuation, symbols), honoring
    Moses-style entity escapes (&pipe; &lt; &gt; &amp;).

    @param text: the (Moses-escaped) text to tokenize further
    @return: the text with spaces inserted between character classes
    """
    # unescape Moses entities (the original code had the entity names
    # corrupted to their literal characters, making the escape
    # round-trip a no-op)
    text = Regex(r"&pipe;").sub(r"|", text)
    text = Regex(r"&lt;").sub(r"<", text)
    text = Regex(r"&gt;").sub(r">", text)
    text = Regex(r"&amp;").sub(r"&", text)
    # split runs of different classes (both orders of each pair)
    text = Regex(r"([\p{N}\p{P}\p{S}])([\p{L}])").sub(r"\1 \2", text)
    text = Regex(r"([\p{L}\p{P}\p{S}])([\p{N}])").sub(r"\1 \2", text)
    text = Regex(r"([\p{L}\p{N}\p{S}])([\p{P}])").sub(r"\1 \2", text)
    text = Regex(r"([\p{L}\p{N}\p{P}])([\p{S}])").sub(r"\1 \2", text)
    text = Regex(r"([\p{L}])([\p{N}\p{P}\p{S}])").sub(r"\1 \2", text)
    text = Regex(r"([\p{N}])([\p{L}\p{P}\p{S}])").sub(r"\1 \2", text)
    text = Regex(r"([\p{P}])([\p{L}\p{N}\p{S}])").sub(r"\1 \2", text)
    text = Regex(r"([\p{S}])([\p{L}\p{N}\p{P}])").sub(r"\1 \2", text)
    # re-escape: ampersand first, so entity ampersands stay intact
    text = Regex(r"&").sub(r"&amp;", text)
    text = Regex(r"\|").sub(r"&pipe;", text)
    text = Regex(r"<").sub(r"&lt;", text)
    text = Regex(r">").sub(r"&gt;", text)
    return text
def __init__(self, options=None):
    """\
    Constructor (pre-compile all needed regexes).

    @param options: dict of options; recognized keys: 'lowercase',
        'moses_escape'
    """
    # NOTE(review): this copy of the constructor was truncated -- the
    # `def` header and the opening of the docstring were missing,
    # leaving a bare prose line at module level (a SyntaxError).
    # Reconstructed; a None default avoids a shared mutable argument.
    options = options if options is not None else {}
    # process options
    self.lowercase = True if options.get('lowercase') else False
    self.moses_escape = True if options.get('moses_escape') else False
    # compile regexes
    self.__spaces = Regex(r'\s+', flags=UNICODE)
    self.__ascii_junk = Regex(r'[\000-\037]')
    self.__special_chars = \
        Regex(r'(([^\p{IsAlnum}\s\.\,−\-])\2*)')
    # single quotes: all unicode quotes + prime
    self.__to_single_quotes = Regex(r'[`‛‚‘’‹›′]')
    # double quotes: all unicode chars incl. Chinese + double prime + ditto
    self.__to_double_quotes = Regex(r'(\'\'|``|[«»„‟“”″〃「」『』〝〞〟])')
    self.__no_numbers = Regex(r'([^\p{N}])([,.])([^\p{N}])')
    self.__pre_numbers = Regex(r'([^\p{N}])([,.])([\p{N}])')
    self.__post_numbers = Regex(r'([\p{N}])([,.])([^\p{N}])')
    # hyphen: separate every time but for unary minus
    self.__minus = Regex(r'([-−])')
    self.__pre_notnum = Regex(r'(-)([^\p{N}])')
    self.__post_num_or_nospace = Regex(r'(\p{N} *|[^ ])(-)')