Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def __init__(self, **kwargs):
annotators: None or empty set (only tokenizes).
self._regexp = regex.compile(
'(%s)|(%s)' % (self.ALPHA_NUM, self.NON_WS),
flags=regex.IGNORECASE + regex.UNICODE + regex.MULTILINE,
if len(kwargs.get('annotators', {})) > 0:
'%s only tokenizes! Skipping annotators: %s'
% (type(self).__name__, kwargs.get('annotators'))
self.annotators = set()
def has_answer(args, answer, t):
text = []
for i in range(len(t)):
res_list = []
if (args.dataset == "CuratedTrec"):
ans_regex = re.compile("(%s)"%answer[0], flags=re.IGNORECASE + re.UNICODE)
return False, res_list
paragraph = " ".join(text)
answer_new = ans_regex.findall(paragraph)
for a in answer_new:
single_answer = normalize(a[0])
single_answer = PROCESS_TOK.tokenize(single_answer)
single_answer = single_answer.words(uncased=True)
for i in range(0, len(text) - len(single_answer) + 1):
if single_answer == text[i: i + len(single_answer)]:
res_list.append((i, i+len(single_answer)-1))
for a in answer:
single_answer = " ".join(a).lower()
single_answer = normalize(single_answer)
single_answer = PROCESS_TOK.tokenize(single_answer)
def __init__(self, container, do_embed=False):
    """Prepare the shared patterns, then collect font statistics.

    When ``self.first_letter_pat`` is still ``None`` both patterns are
    compiled and stored on ``StatsCollector`` as well as on this instance.

    :param container: passed unchanged to ``collect_font_stats``.
    :param do_embed: passed unchanged to ``collect_font_stats``.
    """
    if self.first_letter_pat is None:
        pattern_flags = regex.VERSION1 | regex.UNICODE

        # Run of punctuation (open/close/initial-quote/final-quote/other)
        # anchored at the start of the string.
        # NOTE(review): \p{Ps} appears twice in the class; harmless, but
        # confirm whether a different category was intended.
        leading_punctuation = regex.compile(
            r'^[\p{Ps}\p{Ps}\p{Pe}\p{Pi}\p{Pf}\p{Po}]+', pattern_flags)
        StatsCollector.first_letter_pat = leading_punctuation
        self.first_letter_pat = leading_punctuation

        # A single letter or number character.
        letter_or_number = regex.compile(r'[\p{L}\p{N}]', pattern_flags)
        StatsCollector.capitalize_pat = letter_or_number
        self.capitalize_pat = letter_or_number

    self.collect_font_stats(container, do_embed)
# Pre-compiled tokenizer patterns. `_apostrophes` and `_hyphens` are spliced
# into character classes below, so they are expected to be strings of
# characters defined earlier in the enclosing scope.
_apostrophe_t = regex.compile('[' + _apostrophes + ']t')
"""Apostrophe-t regex, to detect "n't" suffixes."""

# about 25% of the runtime of the tokenizer is spent with this regex
_separation = regex.compile(
    r"(?<=\p{Ll})[.!?]?(?=\p{Lu})|" +  # lowercase-uppercase transitions
    r"[" + _apostrophes + r"]\p{L}+|" +  # apostrophes and their tail
    r"[\p{Ps}\p{Pe}]|" +  # parenthesis and open/close punctuation
    r"\.\.\.|" +  # inner ellipsis
    r"(?<=\p{L})[,;_" + _hyphens + r"](?=[\p{L}\p{Nd}])|" +  # dash-not-digits transition prefix
    r"(?<=[\p{L}\p{Nd}])[,;_" + _hyphens + r"](?=\p{L})"  # dash-not-digits transition postfix
)  # closes the compile() call, which was left unterminated
"""Secondary regex to sub-split non-whitespace sequences."""

_spaces = regex.compile(r"\S+", regex.UNICODE)
"""Primary regex to split strings at any kind of Unicode whitespace."""
def join_hyphenated_words_across_linebreaks(text: str) -> str:
    """Join 'hyphen-\\n ated wor- \\nds' to 'hyphenated words'.

    Every match of ``Tokenizer._hyphen_newline`` in *text* is replaced by
    the empty string; the substitution count is discarded.
    """
    joined, _substitutions = Tokenizer._hyphen_newline.subn("", text)
    return joined
def to_text(tokens: List[Token]) -> str:
    """
    Reconstruct the original text where the Tokens were found.

    This works because a Token stores its spacing prefix.

    :param tokens: the Tokens to join, in order.
    :return: the concatenation of ``str(token)`` for every token.
    """
    return "".join(map(str, tokens))
def compile_pat(pat):
    """Compile *pat* with the ``regex`` module, applying ``REGEX_FLAGS``.

    :param pat: pattern handed straight to ``regex.compile``.
    :return: the result of ``regex.compile`` for *pat*.
    """
    # Imported inside the function, so ``regex`` is only loaded once a
    # pattern is actually compiled.
    from regex import compile as _regex_compile
    return _regex_compile(pat, flags=REGEX_FLAGS)
def __init__(self, options={}):
Constructor (pre-compile all needed regexes).
# process options
self.lowercase = True if options.get('lowercase') else False
self.moses_escape = True if options.get('moses_escape') else False
# compile regexes
self.__spaces = Regex(r'\s+', flags=UNICODE)
self.__ascii_junk = Regex(r'[\000-\037]')
self.__special_chars = \
# single quotes: all unicode quotes + prime
self.__to_single_quotes = Regex(r'[`‛‚‘’‹›′]')
# double quotes: all unicode chars incl. Chinese + double prime + ditto
self.__to_double_quotes = Regex(r'(\'\'|``|[«»„‟“”″〃「」『』〝〞〟])')
self.__no_numbers = Regex(r'([^\p{N}])([,.])([^\p{N}])')
self.__pre_numbers = Regex(r'([^\p{N}])([,.])([\p{N}])')
self.__post_numbers = Regex(r'([\p{N}])([,.])([^\p{N}])')
# hyphen: separate every time but for unary minus
self.__minus = Regex(r'([-−])')
self.__pre_notnum = Regex(r'(-)([^\p{N}])')
self.__post_num_or_nospace = Regex(r'(\p{N} *|[^ ])(-)')
#for numerical codes
matches = re.findall("&#\d+;", tmp_str, flags=re.UNICODE)
if len(matches) > 0 :
hits = set(matches)
for hit in hits :
name = hit[2:-1]
try :
entnum = int(name)
tmp_str = tmp_str.replace(hit, unichr(entnum))
except ValueError:
#for hex codes
matches = re.findall("&#[xX][0-9a-fA-F]+;", tmp_str, flags=re.UNICODE)
if len(matches) > 0 :
hits = set(matches)
for hit in hits :
hex = hit[3:-1]
try :
entnum = int(hex, 16)
tmp_str = tmp_str.replace(hit, unichr(entnum))
except ValueError:
return tmp_str
super(Unicode, self).__init__()
self.encoding = encoding
self.normalize = normalize
if self.normalize:
# Compile the regex that we will use to remove non-
# printables from the resulting unicode.
# Note: using a double negative so that we can exclude
# newlines, which are technically considered control chars.
self.npr = regex.compile(r'[^\P{C}\s]+', regex.UNICODE)
sys.stderr = codecs.getwriter('utf-8')(sys.stderr)
if args.language == "german":
decompounder = get_decompounder()
decompounded_count = 0
for ln, line in enumerate(sys.stdin):
# Collapse whitespace runs and strip leading whitespace.
# NOTE(review): POSIX bracket classes such as [[:space:]] are understood by
# the third-party `regex` module but not by the stdlib `re`; confirm that
# `re` is bound to `regex` in this file.
line = re.sub(r"[[:space:]]+", " ", line.rstrip())
line = re.sub(r"^[[:space:]]+", "", line)
# Replace '' and `` pairs with a plain double quote.
line = re.sub(r"''", "\"", line)
line = re.sub(r"``", "\"", line)
# Remove hyphens adjacent to punctuation (or "$"), and a leading "- ".
# Replacement templates are raw strings so "\g" is not an invalid escape.
line = re.sub(r"-([[:punct:]\$])", r"\g<1>", line)
line = re.sub(r"([[:punct:]\$])-", r"\g<1>", line)
line = re.sub(r"^[[:space:]]*-[[:space:]]", "", line)
# Remove hyphens between an alphanumeric and a space/punctuation character.
# `flags` must be given by keyword: the fourth positional parameter of
# sub() is `count`, so passing re.UNICODE there capped the number of
# replacements instead of setting the flag.
line = re.sub(r"([[:alpha:]0-9ß])-([ [:punct:]])", r"\g<1>\g<2>", line, flags=re.UNICODE)
line = re.sub(r"([ [:punct:]])-([[:alpha:]0-9ß])", r"\g<1>\g<2>", line, flags=re.UNICODE)
# A free-standing hyphen becomes an en dash; merge "– -" into one dash.
line = re.sub(r" - ", " – ", line)
line = re.sub(r"– -", "–", line)
def normalize_quotes(token):
    """Strip a trailing hyphen and convert `` / '' pairs to curly quotes.

    :param token: a single token string.
    :return: *token* without a final "-", with "``" replaced by U+201C
        and "''" replaced by U+201D.
    """
    # `$` also matches just before a trailing newline, so this one stays a
    # regex substitution.
    token = re.sub(r"-$", "", token)
    # Fixed-string replacements do not need the regex engine.
    return token.replace("``", "\u201c").replace("''", "\u201d")
tokenized = [normalize_quotes(t) for t in word_tokenize(line, language=args.language)]
if args.language == "german":
for i, token in enumerate(tokenized):
decompounded_count += 1
decompounded = decompounder.splitWord(token)
def __init__(self, container, do_embed=False):
    """Prepare the shared patterns, then collect font statistics.

    When ``self.first_letter_pat`` is still ``None`` both patterns are
    compiled and stored on ``StatsCollector`` as well as on this instance.

    :param container: passed unchanged to ``collect_font_stats``.
    :param do_embed: passed unchanged to ``collect_font_stats``.
    """
    if self.first_letter_pat is None:
        pattern_flags = regex.VERSION1 | regex.UNICODE

        # Run of punctuation (open/close/initial-quote/final-quote/other)
        # anchored at the start of the string.
        # NOTE(review): \p{Ps} appears twice in the class; harmless, but
        # confirm whether a different category was intended.
        leading_punctuation = regex.compile(
            r'^[\p{Ps}\p{Ps}\p{Pe}\p{Pi}\p{Pf}\p{Po}]+', pattern_flags)
        StatsCollector.first_letter_pat = leading_punctuation
        self.first_letter_pat = leading_punctuation

        # A single letter or number character.
        letter_or_number = regex.compile(r'[\p{L}\p{N}]', pattern_flags)
        StatsCollector.capitalize_pat = letter_or_number
        self.capitalize_pat = letter_or_number

    self.collect_font_stats(container, do_embed)