self.assertEqual(
    rc._regex_pattern(
        r"test",
        rc.DOTALL | rc.IGNORECASE | rc.MULTILINE | rc.WORD |
        rc.BESTMATCH | rc.ENHANCEMATCH | rc.REVERSE | rc.FULLCASE | rc.POSIX
    ).flags,
    regex.V0 | regex.ASCII | regex.DOTALL | regex.IGNORECASE | regex.MULTILINE |
    regex.WORD | regex.ENHANCEMATCH | regex.BESTMATCH | regex.REVERSE | regex.FULLCASE |
    regex.POSIX
)
self.assertEqual(
    rc._regex_pattern(
        r"test",
        rc.UNICODE | rc.DOTALL | rc.IGNORECASE | rc.MULTILINE | rc.FULLCASE |
        rc.WORD | rc.BESTMATCH | rc.ENHANCEMATCH | rc.REVERSE | rc.VERSION1 | rc.POSIX
    ).flags,
    regex.V1 | regex.UNICODE | regex.DOTALL | regex.IGNORECASE | regex.MULTILINE |
    regex.WORD | regex.ENHANCEMATCH | regex.BESTMATCH | regex.REVERSE | regex.FULLCASE |
    regex.POSIX
)
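The two assertions above verify that backrefs translates its flag constants one-to-one onto the regex module's, including the V0/V1 version split. The same mapping can be observed directly on a compiled pattern; the snippet below is a minimal standalone check, independent of backrefs:

import regex

# VERSION1 shows up as regex.V1 in the compiled pattern's .flags,
# alongside the other flags passed at compile time.
p = regex.compile(r"test", regex.VERSION1 | regex.DOTALL | regex.IGNORECASE)
assert p.flags & regex.V1
assert p.flags & regex.DOTALL
assert p.flags & regex.IGNORECASE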
def extract_song_snippet(generated_text):
    # Songs are delimited by blank lines; DOTALL lets '.' span the newlines
    # inside a song, and overlapped=True catches songs that share a delimiter.
    pattern = '\n\n(.*?)\n\n'
    search_results = regex.findall(pattern, generated_text, overlapped=True, flags=regex.DOTALL)
    songs = list(search_results)
    print("Found {} possible songs in generated texts".format(len(songs)))
    return songs
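For reference, this is what overlapped=True buys here: adjacent songs share their blank-line delimiter, so a plain findall would skip every other one after consuming the shared "\n\n". A small self-contained illustration:

import regex

# Two verses share the middle "\n\n"; without overlapped=True only the
# first would be found, because findall resumes after the consumed match.
text = "\n\nfirst verse\n\nsecond verse\n\n"
print(regex.findall(r"\n\n(.*?)\n\n", text, overlapped=True, flags=regex.DOTALL))
# ['first verse', 'second verse']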
"Non-breaking prefix file for language '{}' was not found at path '{}'".format(
language,
non_breaking_prefix_file,
))
self.__non_breaking_prefixes = dict()
with open(non_breaking_prefix_file, mode='r', encoding='utf-8') as prefix_file:
for line in prefix_file.readlines():
if '#NUMERIC_ONLY#' in line:
prefix_type = SentenceSplitter.PrefixType.NUMERIC_ONLY
else:
prefix_type = SentenceSplitter.PrefixType.DEFAULT
# Remove comments
line = regex.sub(pattern=r'#.*', repl='', string=line, flags=regex.DOTALL | regex.UNICODE)
line = line.strip()
if not line:
continue
self.__non_breaking_prefixes[line] = prefix_type
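One subtlety worth noting: with DOTALL, '#.*' consumes everything after the first '#' up to the end of the string, newlines included. That is harmless above only because the input is processed one line at a time. A quick demonstration of the difference:

import regex

multi = "word  # trailing comment\nnext line"
# On a single line, only the trailing comment is removed ...
print(regex.sub(r'#.*', '', "word  # trailing comment"))  # 'word  '
# ... on a multi-line string, DOTALL makes '#.*' eat the rest of it,
# while the default behavior stops at the newline.
print(regex.sub(r'#.*', '', multi, flags=regex.DOTALL))   # 'word  '
print(regex.sub(r'#.*', '', multi))                       # 'word  \nnext line'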
import regex
from . import botsite
from .core import check, EditQueue
from .botsite import cur_timestamp, get_summary
# This module is called once an hour.
MAX_WORK_PER_HOUR = 50
LAST_SORT_KEY = None
tar_template = '[Cc]ite |[Cc]itation'
tar_para = 'language'
para_re = regex.compile(r'(?P<head>{{\s*(%s)(?:(?!{{|}}).)*?(?P<nest>{{(?:(?!{{).)*?(?&nest)?(?:(?!}}).)*?}})*'
                        r'(?:(?!{{|}}).)*?\|\s*(%s)\s*=\s*)(?P<value>.*?)'
                        r'(?P<tail>\s*(\|\s*(?:(?!{{|}}).)*(?&nest)*(?:(?!{{|}}).)*?)?}})' %
                        (tar_template, tar_para), regex.DOTALL)
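The (?&nest) calls recurse into the named nest group so the pattern can step over arbitrarily deep {{...}} template nesting. A minimal sketch of the same device on its own:

import regex

# The named group calls itself via (?&nest) to match balanced braces.
nested = regex.compile(r"(?P<nest>{{(?:[^{}]|(?&nest))*}})", regex.DOTALL)
print(nested.search("{{cite web |language = {{fr icon}} }}").group())
# {{cite web |language = {{fr icon}} }}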
sub_dict = {
r'阿拉伯[语語文]|Arabic': 'ar',
r'保加利亚[语文]|保加利亞[語文]|Bulgarian': 'bg',
r'波斯尼亚[语文]|波士尼亚[語文]|Bosnian': 'bs',
r'加泰罗尼亚[语文]|加泰羅尼亞[語文]|Catalan': 'ca',
r'捷克[语語文]|Czech': 'cs',
r'丹麦[语文]|丹麥[語文]|Danish': 'da',
r'德[语語文]|Germany?|Deutsch|de-DE': 'de',
r'希腊[语文]|希臘[語文]|Greek': 'el',
r'英[语語文]|English|en-(UK|IN)|\[\[English language(\|English)?\]\]': 'en',
r'西班牙[语語文]|Spanish|español|\[\[西班牙語(\|Spanish)?\]\]': 'es',
r'爱沙尼亚[语文]|愛沙尼亞[語文]|Estonian': 'et',
r'波斯[语語文]|Persian': 'fa',
r'芬兰[语文]|芬蘭[語文]|Finnish': 'fi',
r'法[语語文]|French|Français|fr-FR|\[\[French language(\|French)?\]\]|\{\{fr icon\}\}': 'fr',
| from
| has
| i(?: nto|s )
| o[fr]
| t(?: han|hat|hrough )
| via
| w(?: as|ere|hether|ith )
)\b""", UNICODE | VERBOSE)
"Lower-case words that in the given form usually don't start a sentence."
BEFORE_LOWER = compile(r""" .*?
(?: [%s]"[\)\]]* # ."]) .") ."
| [%s] [\)\]]+ # .]) .)
| \b spp \. # spp. (species pluralis)
| \b \p{L} \p{Ll}? \. # Ll. L.
) \s+ $""" % (SENTENCE_TERMINALS, SENTENCE_TERMINALS), DOTALL | UNICODE | VERBOSE
)
"""
Endings that, if followed by a lower-case word, are not sentence terminals:
- Quotations and brackets ("Hello!" said the man.)
- dotted abbreviations (U.S.A. was)
- genus-species-like (m. musculus)
"""
LOWER_WORD = compile(r'^\p{Ll}+[%s]?\p{Ll}*\b' % HYPHENS, UNICODE)
"Lower-case words are not sentence starters (after an abbreviation)."
MIDDLE_INITIAL_END = compile(r'\b\p{Lu}\p{Ll}+\W+\p{Lu}$', UNICODE)
"Upper-case initial after upper-case word at the end of a string."
UPPER_WORD_START = compile(r'^\p{Lu}\p{Ll}+\b', UNICODE)
"Upper-case word at the beginning of a string."
if not hasattr(self, "cached_regex"):
    self.cached_regex = {}

method_path = method_meta["plugin_info"]["parent_path"]
if method_path not in self.cached_regex:
    regex_string = method_meta["regex_pattern"]
    if "case_sensitive" in method_meta and not method_meta["case_sensitive"]:
        regex_string = "(?i)%s" % regex_string

    flags = regex.ENHANCEMATCH
    if method_meta["multiline"]:
        flags |= regex.MULTILINE | regex.DOTALL

    # Allow a bounded number of errors; if the supplied pattern itself fails
    # to compile, fall back to matching it as an escaped literal.
    try:
        self.cached_regex[method_path] = regex.compile("%s{e<=%s}" % (
            regex_string,
            settings.FUZZY_REGEX_ALLOWABLE_ERRORS
        ), flags)
    except regex.error:
        self.cached_regex[method_path] = regex.compile("%s{e<=%s}" % (
            regex.escape(regex_string),
            settings.FUZZY_REGEX_ALLOWABLE_ERRORS
        ), flags)
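The {e<=N} suffix is the regex module's fuzzy-matching syntax: up to N edits (insertions, deletions, substitutions) are tolerated, and ENHANCEMATCH biases the engine toward the attempt with the fewest errors. A self-contained taste of it:

import regex

# The group keeps the error budget scoped to the whole word.
fuzzy = regex.compile(r"(?:segmentation){e<=2}", regex.ENHANCEMATCH)
m = fuzzy.search("a segmentatoin fault occurred")
print(m.group(), m.fuzzy_counts)  # e.g. segmentatoin (2, 0, 0) as (subs, ins, dels)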
"Scm−1", "Acm−1", "eV−1cm−2", "cm-2", "sccm", "cm−2eV−1", "cm−3eV−1",
"kA", "s−1", "emu", "L", "cmHz1", "gmol−1", "kVcm−1", "MPam1",
"cm2V−1s−1", "Acm−2", "cm−2s−1", "MV", "ionscm−2", "Jcm−2", "ncm−2",
"Jcm−2", "Wcm−2", "GWcm−2", "Acm−2K−2", "gcm−3", "cm3g−1", "mgl−1",
"mgml−1", "mgcm−2", "mΩcm", "cm−2s−1", "cm−2", "ions", "moll−1",
"nmol", "psi", "mol·L−1", "Jkg−1K−1", "km", "Wm−2", "mass", "mmHg",
"mmmin−1", "GeV", "m−2", "m−2s−1", "Kmin−1", "gL−1", "ng", "hr", "w",
"mN", "kN", "Mrad", "rad", "arcsec", "Ag−1", "dpa", "cdm−2",
"cd", "mcd", "mHz", "m−3", "ppm", "phr", "mL", "ML", "mlmin−1", "MWm−2",
"Wm−1K−1", "Wm−1K−1", "kWh", "Wkg−1", "Jm−3", "m-3", "gl−1", "A−1",
"Ks−1", "mgdm−3", "mms−1", "ks", "appm", "ºC", "HV", "kDa", "Da", "kG",
"kGy", "MGy", "Gy", "mGy", "Gbps", "μB", "μL", "μF", "nF", "pF", "mF",
"A", "Å", "A˚", "μgL−1"]
NR_BASIC = regex.compile(r"^[+-]?\d*\.?\d+\(?\d*\)?+$", regex.DOTALL)
NR_AND_UNIT = regex.compile(r"^([+-]?\d*\.?\d+\(?\d*\)?+)([\p{script=Latin}|Ω|μ]+.*)", regex.DOTALL)
PUNCT = list(string.punctuation) + ["\"", "“", "”", "≥", "≤", "×"]
def __init__(self, phraser_path=PHRASER_PATH):
    self.elem_name_dict = {en: es for en, es in zip(self.ELEMENT_NAMES, self.ELEMENTS)}
    self.phraser = Phraser.load(phraser_path)
def tokenize(self, text, split_oxidation=True, keep_sentences=True):
    """Converts a string to a list of tokens (words) using a modified chemdataextractor tokenizer.

    Adds a few fixes for inorganic materials science, such as splitting common units from numbers
    and splitting the valence state.

    Args:
        text: input text as a string
        split_oxidation: if True, will split the oxidation state from the element, e.g. iron(II)
            will become iron (II), same with Fe(II), etc.
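NR_AND_UNIT does the unit-splitting mentioned in the docstring: group one takes the numeric value (with an optional uncertainty in parentheses) and group two takes a unit that starts with a Latin letter, Ω, or μ. A quick self-contained check:

import regex

NR_AND_UNIT = regex.compile(r"^([+-]?\d*\.?\d+\(?\d*\)?+)([\p{script=Latin}Ωμ]+.*)", regex.DOTALL)
print(NR_AND_UNIT.match("5.2eV").groups())        # ('5.2', 'eV')
print(NR_AND_UNIT.match("1.5(3)μgL−1").groups())  # ('1.5(3)', 'μgL−1')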
def generate_regexes(self, ignore_case: bool = False) -> List[Pattern]:
    definitions = self.get_definitions()
    options = regex.DOTALL | (regex.IGNORECASE if ignore_case else 0)
    return list(map(lambda d: RegExpUtility.get_safe_reg_exp(d, options),
                    definitions))
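RegExpUtility.get_safe_reg_exp is a recognizers-text wrapper; under the assumption that it ultimately calls regex.compile, the method boils down to this plain sketch:

import regex
from typing import List, Pattern

def compile_definitions(definitions: List[str], ignore_case: bool = False) -> List[Pattern]:
    # IGNORECASE is ORed in only on request; 0 is the identity for flag bitmasks.
    options = regex.DOTALL | (regex.IGNORECASE if ignore_case else 0)
    return [regex.compile(d, options) for d in definitions]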
def __is_valid_format(iban):
    country_code = iban[:2]
    if country_code in regex_per_country:
        country_regex = regex_per_country[country_code]
        return country_regex and re.match(country_regex, iban,
                                          flags=re.DOTALL | re.MULTILINE)
    return False
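regex_per_country is not shown above; a hedged sketch with one illustrative entry makes the check runnable (German IBANs are 'DE', two check digits, then 18 digits):

import re

regex_per_country = {'DE': r'^DE\d{20}$'}  # illustrative entry, not the real table

def is_valid_format(iban):
    country_regex = regex_per_country.get(iban[:2])
    return bool(country_regex and re.match(country_regex, iban,
                                           flags=re.DOTALL | re.MULTILINE))

print(is_valid_format('DE89370400440532013000'))  # True
print(is_valid_format('XX123'))                   # False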
'MΩ', 'Ω', 'kΩ', 'mΩ', 'mgL−1', 'moldm−3', 'm2', 'm3', 'cm-1', 'cm',
'Scm−1', 'Acm−1', 'eV−1cm−2', 'cm-2', 'sccm', 'cm−2eV−1', 'cm−3eV−1',
'kA', 's−1', 'emu', 'L', 'cmHz1', 'gmol−1', 'kVcm−1', 'MPam1',
'cm2V−1s−1', 'Acm−2', 'cm−2s−1', 'MV', 'ionscm−2', 'Jcm−2', 'ncm−2',
'Jcm−2', 'Wcm−2', 'GWcm−2', 'Acm−2K−2', 'gcm−3', 'cm3g−1', 'mgl−1',
'mgml−1', 'mgcm−2', 'mΩcm', 'cm−2s−1', 'cm−2', 'ions', 'moll−1',
'nmol', 'psi', 'mol·L−1', 'Jkg−1K−1', 'km', 'Wm−2', 'mass', 'mmHg',
'mmmin−1', 'GeV', 'm−2', 'm−2s−1', 'Kmin−1', 'gL−1', 'ng', 'hr', 'w',
'mN', 'kN', 'Mrad', 'rad', 'arcsec', 'Ag−1', 'dpa', 'cdm−2',
'cd', 'mcd', 'mHz', 'm−3', 'ppm', 'phr', 'mL', 'ML', 'mlmin−1', 'MWm−2',
'Wm−1K−1', 'Wm−1K−1', 'kWh', 'Wkg−1', 'Jm−3', 'm-3', 'gl−1', 'A−1',
'Ks−1', 'mgdm−3', 'mms−1', 'ks', 'appm', 'ºC', 'HV', 'kDa', 'Da', 'kG',
'kGy', 'MGy', 'Gy', 'mGy', 'Gbps', 'μB', 'μL', 'μF', 'nF', 'pF', 'mF',
'A', 'Å', 'A˚', "μgL−1"]
NR_BASIC = regex.compile(r'^[+-]?\d*\.?\d+\(?\d*\)?+$', regex.DOTALL)
NR_AND_UNIT = regex.compile(r'^([+-]?\d*\.?\d+\(?\d*\)?+)([\p{script=Latin}Ωμ]+.*)', regex.DOTALL)
PUNCT = list(string.punctuation) + ['"', '“', '”', '≥', '≤', '×']
def __init__(self, phraser_path=PHRASER_PATH):
    self.elem_name_dict = {en: es for en, es in zip(self.ELEMENT_NAMES, self.ELEMENTS)}
    self.phraser = Phraser.load(phraser_path)
def tokenize(self, text, split_oxidation=True, keep_sentences=True):
    """
    Converts a string to a list of tokens (words) using the chemdataextractor tokenizer, with a
    couple of fixes for inorganic materials science.
    Keeps the structure of sentences.

    :param text: input text as a string
    :param split_oxidation: if True, will split the oxidation state from the element, e.g. iron(II)
        will become iron (II), same with Fe(II), etc.