Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test_infinite_loop_catch(self):
    """Check that each listed call to ``compile_search`` raises ``LoopException``."""
    # Each entry is the positional argument tuple for one call; the second
    # call deliberately passes no flags argument.
    looping_calls = (
        (r'(?-x:(?x))', regex.V0 | regex.VERBOSE),
        (r'(?V1)(?V0)',),
    )
    for call_args in looping_calls:
        with pytest.raises(_bregex_parse.LoopException):
            bregex.compile_search(*call_args)
# 202B), l-t-r/r-t-l override (202D, 202E), pop directional
# formatting (202C), zero-width no-break space (FEFF)
self.other_nasties = re.compile(r"[\u00AD\u061C\u200B-\u200F\u202A-\u202E\u2060\u2066-\u2069\uFEFF]")
# TAGS, EMAILS, URLs
self.xml_declaration = re.compile(r"""<\?xml
(?: # This group permits zero or more attributes
\s+ # Whitespace to separate attributes
[_:A-Z][-.:\w]* # Attribute name
\s*=\s* # Attribute name-value delimiter
(?: "[^"]*" # Double-quoted attribute value
| '[^']*' # Single-quoted attribute value
)
)*
\s* # Permit trailing whitespace
\?>""", re.VERBOSE | re.IGNORECASE)
# self.tag = re.compile(r'<(?!-)(?:/[^> ]+|[^>]+/?)(?')
# taken from Regular Expressions Cookbook
self.tag = re.compile(r"""
<
(?: # Branch for opening tags:
([_:A-Z][-.:\w]*) # Capture the opening tag name to backreference 1
(?: # This group permits zero or more attributes
\s+ # Whitespace to separate attributes
[_:A-Z][-.:\w]* # Attribute name
\s*=\s* # Attribute name-value delimiter
(?: "[^"]*" # Double-quoted attribute value
| '[^']*' # Single-quoted attribute value
)
)*
\s* # Permit trailing whitespace
/? # Permit self-closed tags
)
(?: # non capturing optional group for value
: # match :
( # 2nd capturing group: default value
(?: # non capturing group for OR
[^{}] # any non bracket
| # OR
\{ # literal {
(?2) # recursive 2nd capturing group aka ([^{}]|{(?2)})
\} # literal }
)* #
)
)?
\} # end of macher }
""",
regex.VERBOSE
)
# Verbose-mode pattern: any run of characters, a literal `${`, any run of
# characters, a literal `}`, then any run of characters. No DOTALL flag is
# set, so `.` does not cross a newline.
IMPLICIT_ENV_VAR_MATCHER = re.compile(
    r"""
.* # matches any number of any characters
\$\{.*\} # matches any number of any characters
# between `${` and `}` literally
.* # matches any number of any characters
""",
    flags=re.VERBOSE,
)
def setup_parser():
parser = argparse.ArgumentParser()
parser.add_argument(
'-v',
node = self.get_title_node(title, lang)
if bracket:
#look behind for opening brace ({, and ahead for closing brace })
re_string = ur"""(?<= # look behind for opening brace
[({] # literal '(', brace,
[^})]* # anything but a closing ) or brace
)
""" + title + node.delimiter_re + node.regex(lang) + ur"""
(?= # look ahead for closing brace
[^({]* # match of anything but an opening '(' or brace
[)}] # zero-width: literal ')' or brace
)"""
else:
re_string = '^' + title + node.delimiter_re + node.regex(lang)
return regex.compile(re_string, regex.VERBOSE) # Uses regex instead of re2 for the more intricate regexes at this stage.
| [Gg]eneral
| [Mm](?:ag)?is(?:ter|s)
| [Pp]rofessor
| [Ss]e\u00F1or(?:it)?a?
) \s
# 4.b. if they are most likely part of an author list: (avoiding "...A and B")
| (?: (?10%):
# after, though, upon, while, yet
#
# Words hardly used after abbrevs vs. SSs (poor continuations, <2%):
# [after], as, at, but, during, for, in, nor, on, to, [though], [upon],
# whereas, [while], within, [yet]
(?:[^\W\d_](?:[^\W\d_]|['\-_])+[^\W\d_]) # Words with apostrophes or dashes.
|
(?:[+\-]?\d+[,/.:-]\d+[+\-]?) # Numbers, including fractions, decimals.
|
(?:[\w_]+) # Words without apostrophes or dashes.
|
(?:\.(?:\s*\.){1,}) # Ellipsis dots.
|
(?:\S) # Everything else that isn't whitespace.
""",
)
######################################################################
# Core tokenizer: all entries of REGEXPS are joined with "|" and wrapped in a
# single capturing group.
WORD_RE = regex.compile(
    "(%s)" % "|".join(REGEXPS),
    regex.UNICODE | regex.I | regex.VERBOSE,
)

# One character outside [a-zA-Z0-9] followed by at least three more copies of
# itself. WORD_RE performs poorly on such runs.
HANG_RE = regex.compile(r"([^a-zA-Z0-9])\1{3,}")

# Emoticons are compiled into a separate regex so that their case can be
# preserved when needed.
EMOTICON_RE = regex.compile(
    EMOTICONS,
    regex.UNICODE | regex.I | regex.VERBOSE,
)

# Used for regularizing HTML entities to Unicode.
ENT_RE = regex.compile(r"&(#?(x?))([^&;\s]+);")
######################################################################
# Functions for converting html entities
######################################################################
# Any number of defang characters.
(?:
\x20|
""" + SEPARATOR_DEFANGS + r"""
)*
# Domain/path characters.
\w
\S+?
# CISCO ESA style defangs followed by domain/path characters.
(?:\x20[\/\.][^\.\/\s]\S*?)*
)
""" + END_PUNCTUATION + r"""
(?=\s|$)
""", re.IGNORECASE | re.VERBOSE | re.UNICODE)
# Get some obfuscated urls, main anchor is brackets around the period.
BRACKET_URL_RE = re.compile(r"""
\b
(
[\.\:\/\\\w\[\]\(\)-]+
(?:
\x20?
[\(\[]
\x20?
\.
\x20?
[\]\)]
\x20?
\S*?
)+
# Re-export the flag constants of the `regex` module under identical names in
# this module's namespace. The names are listed in pairs, presumably the short
# and long spelling of the same flag in `regex` (confirm against its docs).
F = regex.F
FULLCASE = regex.FULLCASE
I = regex.I
IGNORECASE = regex.IGNORECASE
L = regex.L
LOCALE = regex.LOCALE
M = regex.M
MULTILINE = regex.MULTILINE
R = regex.R
REVERSE = regex.REVERSE
S = regex.S
DOTALL = regex.DOTALL
U = regex.U
UNICODE = regex.UNICODE
X = regex.X
VERBOSE = regex.VERBOSE
V0 = regex.V0
VERSION0 = regex.VERSION0
V1 = regex.V1
VERSION1 = regex.VERSION1
W = regex.W
WORD = regex.WORD
P = regex.P
POSIX = regex.POSIX
# Bound once when this module is executed; a later rebinding of
# `regex.DEFAULT_VERSION` is not reflected here.
DEFAULT_VERSION = regex.DEFAULT_VERSION
# Type of the object returned by `regex.compile`, taken from a pattern
# compiled from the empty string with flags 0.
REGEX_TYPE = type(regex.compile('', 0))
# Direct aliases of the corresponding `regex` functions.
escape = regex.escape
purge = regex.purge
utokens = {
"regex_flags": re.compile(
r'(?s)(\\.)|\(\?((?:[Laberuxp]|V0|V1|-?[imsfw])+)[):]|(.)'
)
\)
''', regex.VERBOSE)
# This regex matches Markdown formatting such as _italic_, **bold**, or
# ~strikethrough~, and extracts the text inside it as \2.
MARKDOWN_FORMAT_RES = [
regex.compile(rf"""
(?
def match_decorator(fn):
    """Attach the compiled pattern's ``split`` and ``match`` to *fn* and return it.

    The pattern is compiled with ``UNICODE | VERBOSE``.

    NOTE(review): ``regex``, ``compile``, ``UNICODE`` and ``VERBOSE`` are
    resolved from a scope that is not visible here; confirm where they
    are defined.
    """
    compiled = compile(regex, UNICODE | VERBOSE)
    for method_name in ("split", "match"):
        setattr(fn, method_name, getattr(compiled, method_name))
    return fn