self.assertEqual(
    rc._regex_pattern(
        r"test",
        rc.DOTALL | rc.IGNORECASE | rc.MULTILINE | rc.WORD |
        rc.BESTMATCH | rc.ENHANCEMATCH | rc.REVERSE | rc.FULLCASE | rc.POSIX
    ).flags,
    regex.V0 | regex.ASCII | regex.DOTALL | regex.IGNORECASE | regex.MULTILINE |
    regex.WORD | regex.ENHANCEMATCH | regex.BESTMATCH | regex.REVERSE | regex.FULLCASE |
    regex.POSIX
)
self.assertEqual(
    rc._regex_pattern(
        r"test",
        rc.UNICODE | rc.DOTALL | rc.IGNORECASE | rc.MULTILINE | rc.FULLCASE |
        rc.WORD | rc.BESTMATCH | rc.ENHANCEMATCH | rc.REVERSE | rc.VERSION1 | rc.POSIX
    ).flags,
    regex.V1 | regex.UNICODE | regex.DOTALL | regex.IGNORECASE | regex.MULTILINE |
    regex.WORD | regex.ENHANCEMATCH | regex.BESTMATCH | regex.REVERSE | regex.FULLCASE |
    regex.POSIX
)
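The two assertions above verify that backrefs translates its flag constants one-to-one onto the regex module's, including the V0/V1 version split. The same mapping can be observed directly on a compiled pattern; the snippet below is a minimal standalone check, independent of backrefs:

import regex

# VERSION1 shows up as regex.V1 in the compiled pattern's .flags,
# alongside the other flags passed at compile time.
p = regex.compile(r"test", regex.VERSION1 | regex.DOTALL | regex.IGNORECASE)
assert p.flags & regex.V1
assert p.flags & regex.DOTALL
assert p.flags & regex.IGNORECASE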
def extract_song_snippet(generated_text):
    # Songs are delimited by blank lines; DOTALL lets '.' span the newlines
    # inside a song, and overlapped=True catches songs that share a delimiter.
    pattern = '\n\n(.*?)\n\n'
    search_results = regex.findall(pattern, generated_text, overlapped=True, flags=regex.DOTALL)
    songs = list(search_results)
    print("Found {} possible songs in generated texts".format(len(songs)))
    return songs
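For reference, this is what overlapped=True buys here: adjacent songs share their blank-line delimiter, so a plain findall would skip every other one after consuming the shared "\n\n". A small self-contained illustration:

import regex

# Two verses share the middle "\n\n"; without overlapped=True only the
# first would be found, because findall resumes after the consumed match.
text = "\n\nfirst verse\n\nsecond verse\n\n"
print(regex.findall(r"\n\n(.*?)\n\n", text, overlapped=True, flags=regex.DOTALL))
# ['first verse', 'second verse']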
"Non-breaking prefix file for language '{}' was not found at path '{}'".format(
language,
non_breaking_prefix_file,
))
self.__non_breaking_prefixes = dict()
with open(non_breaking_prefix_file, mode='r', encoding='utf-8') as prefix_file:
for line in prefix_file.readlines():
if '#NUMERIC_ONLY#' in line:
prefix_type = SentenceSplitter.PrefixType.NUMERIC_ONLY
else:
prefix_type = SentenceSplitter.PrefixType.DEFAULT
# Remove comments
line = regex.sub(pattern=r'#.*', repl='', string=line, flags=regex.DOTALL | regex.UNICODE)
line = line.strip()
if not line:
continue
self.__non_breaking_prefixes[line] = prefix_type
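One subtlety worth noting: with DOTALL, '#.*' consumes everything after the first '#' up to the end of the string, newlines included. That is harmless above only because the input is processed one line at a time. A quick demonstration of the difference:

import regex

multi = "word  # trailing comment\nnext line"
# On a single line, only the trailing comment is removed ...
print(regex.sub(r'#.*', '', "word  # trailing comment"))  # 'word  '
# ... on a multi-line string, DOTALL makes '#.*' eat the rest of it,
# while the default behavior stops at the newline.
print(regex.sub(r'#.*', '', multi, flags=regex.DOTALL))   # 'word  '
print(regex.sub(r'#.*', '', multi))                       # 'word  \nnext line'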
import regex
from . import botsite
from .core import check, EditQueue
from .botsite import cur_timestamp, get_summary
# This module is called once an hour.
MAX_WORK_PER_HOUR = 50
LAST_SORT_KEY = None
tar_template = '[Cc]ite |[Cc]itation'
tar_para = 'language'
para_re = regex.compile(r'(?P<head>{{\s*(%s)(?:(?!{{|}}).)*?(?P<nest>{{(?:(?!{{).)*?(?&nest)?(?:(?!}}).)*?}})*'
                        r'(?:(?!{{|}}).)*?\|\s*(%s)\s*=\s*)(?P<value>.*?)'
                        r'(?P<tail>\s*(\|\s*(?:(?!{{|}}).)*(?&nest)*(?:(?!{{|}}).)*?)?}})' %
                        (tar_template, tar_para), regex.DOTALL)
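The (?&nest) calls recurse into the named nest group so the pattern can step over arbitrarily deep {{...}} template nesting. A minimal sketch of the same device on its own:

import regex

# The named group calls itself via (?&nest) to match balanced braces.
nested = regex.compile(r"(?P<nest>{{(?:[^{}]|(?&nest))*}})", regex.DOTALL)
print(nested.search("{{cite web |language = {{fr icon}} }}").group())
# {{cite web |language = {{fr icon}} }}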
sub_dict = {
r'阿拉伯[语語文]|Arabic': 'ar',
r'保加利亚[语文]|保加利亞[語文]|Bulgarian': 'bg',
r'波斯尼亚[语文]|波士尼亚[語文]|Bosnian': 'bs',
r'加泰罗尼亚[语文]|加泰羅尼亞[語文]|Catalan': 'ca',
r'捷克[语語文]|Czech': 'cs',
r'丹麦[语文]|丹麥[語文]|Danish': 'da',
r'德[语語文]|Germany?|Deutsch|de-DE': 'de',
r'希腊[语文]|希臘[語文]|Greek': 'el',
r'英[语語文]|English|en-(UK|IN)|\[\[English language(\|English)?\]\]': 'en',
r'西班牙[语語文]|Spanish|español|\[\[西班牙語(\|Spanish)?\]\]': 'es',
r'爱沙尼亚[语文]|愛沙尼亞[語文]|Estonian': 'et',
r'波斯[语語文]|Persian': 'fa',
r'芬兰[语文]|芬蘭[語文]|Finnish': 'fi',
r'法[语語文]|French|Français|fr-FR|\[\[French language(\|French)?\]\]|\{\{fr icon\}\}': 'fr',
| from
| has
| i(?: nto|s )
| o[fr]
| t(?: han|hat|hrough )
| via
| w(?: as|ere|hether|ith )
)\b""", UNICODE | VERBOSE)
"Lower-case words that in the given form usually don't start a sentence."
BEFORE_LOWER = compile(r""" .*?
(?: [%s]"[\)\]]* # ."]) .") ."
| [%s] [\)\]]+ # .]) .)
| \b spp \. # spp. (species pluralis)
| \b \p{L} \p{Ll}? \. # Ll. L.
) \s+ $""" % (SENTENCE_TERMINALS, SENTENCE_TERMINALS), DOTALL | UNICODE | VERBOSE
)
"""
Endings that, if followed by a lower-case word, are not sentence terminals:
- Quotations and brackets ("Hello!" said the man.)
- dotted abbreviations (U.S.A. was)
- genus-species-like (m. musculus)
"""
LOWER_WORD = compile(r'^\p{Ll}+[%s]?\p{Ll}*\b' % HYPHENS, UNICODE)
"Lower-case words are not sentence starters (after an abbreviation)."
MIDDLE_INITIAL_END = compile(r'\b\p{Lu}\p{Ll}+\W+\p{Lu}$', UNICODE)
"Upper-case initial after upper-case word at the end of a string."
UPPER_WORD_START = compile(r'^\p{Lu}\p{Ll}+\b', UNICODE)
"Upper-case word at the beginning of a string."
if not hasattr(self, "cached_regex"):
    self.cached_regex = {}

method_path = method_meta["plugin_info"]["parent_path"]
if method_path not in self.cached_regex:
    regex_string = method_meta["regex_pattern"]
    if "case_sensitive" in method_meta and not method_meta["case_sensitive"]:
        regex_string = "(?i)%s" % regex_string

    flags = regex.ENHANCEMATCH
    if method_meta["multiline"]:
        flags |= regex.MULTILINE | regex.DOTALL

    # Allow a bounded number of errors; if the supplied pattern itself fails
    # to compile, fall back to matching it as an escaped literal.
    try:
        self.cached_regex[method_path] = regex.compile("%s{e<=%s}" % (
            regex_string,
            settings.FUZZY_REGEX_ALLOWABLE_ERRORS
        ), flags)
    except regex.error:
        self.cached_regex[method_path] = regex.compile("%s{e<=%s}" % (
            regex.escape(regex_string),
            settings.FUZZY_REGEX_ALLOWABLE_ERRORS
        ), flags)
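The {e<=N} suffix is the regex module's fuzzy-matching syntax: up to N edits (insertions, deletions, substitutions) are tolerated, and ENHANCEMATCH biases the engine toward the attempt with the fewest errors. A self-contained taste of it:

import regex

# The group keeps the error budget scoped to the whole word.
fuzzy = regex.compile(r"(?:segmentation){e<=2}", regex.ENHANCEMATCH)
m = fuzzy.search("a segmentatoin fault occurred")
print(m.group(), m.fuzzy_counts)  # e.g. segmentatoin (2, 0, 0) as (subs, ins, dels)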
"Scm−1", "Acm−1", "eV−1cm−2", "cm-2", "sccm", "cm−2eV−1", "cm−3eV−1",
"kA", "s−1", "emu", "L", "cmHz1", "gmol−1", "kVcm−1", "MPam1",
"cm2V−1s−1", "Acm−2", "cm−2s−1", "MV", "ionscm−2", "Jcm−2", "ncm−2",
"Jcm−2", "Wcm−2", "GWcm−2", "Acm−2K−2", "gcm−3", "cm3g−1", "mgl−1",
"mgml−1", "mgcm−2", "mΩcm", "cm−2s−1", "cm−2", "ions", "moll−1",
"nmol", "psi", "mol·L−1", "Jkg−1K−1", "km", "Wm−2", "mass", "mmHg",
"mmmin−1", "GeV", "m−2", "m−2s−1", "Kmin−1", "gL−1", "ng", "hr", "w",
"mN", "kN", "Mrad", "rad", "arcsec", "Ag−1", "dpa", "cdm−2",
"cd", "mcd", "mHz", "m−3", "ppm", "phr", "mL", "ML", "mlmin−1", "MWm−2",
"Wm−1K−1", "Wm−1K−1", "kWh", "Wkg−1", "Jm−3", "m-3", "gl−1", "A−1",
"Ks−1", "mgdm−3", "mms−1", "ks", "appm", "ºC", "HV", "kDa", "Da", "kG",
"kGy", "MGy", "Gy", "mGy", "Gbps", "μB", "μL", "μF", "nF", "pF", "mF",
"A", "Å", "A˚", "μgL−1"]
NR_BASIC = regex.compile(r"^[+-]?\d*\.?\d+\(?\d*\)?+$", regex.DOTALL)
NR_AND_UNIT = regex.compile(r"^([+-]?\d*\.?\d+\(?\d*\)?+)([\p{script=Latin}|Ω|μ]+.*)", regex.DOTALL)
PUNCT = list(string.punctuation) + ["\"", "“", "”", "≥", "≤", "×"]
def __init__(self, phraser_path=PHRASER_PATH):
    self.elem_name_dict = {en: es for en, es in zip(self.ELEMENT_NAMES, self.ELEMENTS)}
    self.phraser = Phraser.load(phraser_path)
def tokenize(self, text, split_oxidation=True, keep_sentences=True):
    """Converts a string to a list of tokens (words) using a modified chemdataextractor tokenizer.

    Adds a few fixes for inorganic materials science, such as splitting common units from numbers
    and splitting the valence state.

    Args:
        text: input text as a string
        split_oxidation: if True, will split the oxidation state from the element, e.g. iron(II)
            will become iron (II), same with Fe(II), etc.
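NR_AND_UNIT does the unit-splitting mentioned in the docstring: group one takes the numeric value (with an optional uncertainty in parentheses) and group two takes a unit that starts with a Latin letter, Ω, or μ. A quick self-contained check:

import regex

NR_AND_UNIT = regex.compile(r"^([+-]?\d*\.?\d+\(?\d*\)?+)([\p{script=Latin}Ωμ]+.*)", regex.DOTALL)
print(NR_AND_UNIT.match("5.2eV").groups())        # ('5.2', 'eV')
print(NR_AND_UNIT.match("1.5(3)μgL−1").groups())  # ('1.5(3)', 'μgL−1')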
def generate_regexes(self, ignore_case: bool = False) -> List[Pattern]:
    definitions = self.get_definitions()
    options = regex.DOTALL | (regex.IGNORECASE if ignore_case else 0)
    return list(map(lambda d: RegExpUtility.get_safe_reg_exp(d, options),
                    definitions))
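RegExpUtility.get_safe_reg_exp is a recognizers-text wrapper; under the assumption that it ultimately calls regex.compile, the method boils down to this plain sketch:

import regex
from typing import List, Pattern

def compile_definitions(definitions: List[str], ignore_case: bool = False) -> List[Pattern]:
    # IGNORECASE is ORed in only on request; 0 is the identity for flag bitmasks.
    options = regex.DOTALL | (regex.IGNORECASE if ignore_case else 0)
    return [regex.compile(d, options) for d in definitions]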
def __is_valid_format(iban):
    country_code = iban[:2]
    if country_code in regex_per_country:
        country_regex = regex_per_country[country_code]
        return country_regex and re.match(country_regex, iban,
                                          flags=re.DOTALL | re.MULTILINE)
    return False
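regex_per_country is not shown above; a hedged sketch with one illustrative entry makes the check runnable (German IBANs are 'DE', two check digits, then 18 digits):

import re

regex_per_country = {'DE': r'^DE\d{20}$'}  # illustrative entry, not the real table

def is_valid_format(iban):
    country_regex = regex_per_country.get(iban[:2])
    return bool(country_regex and re.match(country_regex, iban,
                                           flags=re.DOTALL | re.MULTILINE))

print(is_valid_format('DE89370400440532013000'))  # True
print(is_valid_format('XX123'))                   # False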
'MΩ', 'Ω', 'kΩ', 'mΩ', 'mgL−1', 'moldm−3', 'm2', 'm3', 'cm-1', 'cm',
'Scm−1', 'Acm−1', 'eV−1cm−2', 'cm-2', 'sccm', 'cm−2eV−1', 'cm−3eV−1',
'kA', 's−1', 'emu', 'L', 'cmHz1', 'gmol−1', 'kVcm−1', 'MPam1',
'cm2V−1s−1', 'Acm−2', 'cm−2s−1', 'MV', 'ionscm−2', 'Jcm−2', 'ncm−2',
'Jcm−2', 'Wcm−2', 'GWcm−2', 'Acm−2K−2', 'gcm−3', 'cm3g−1', 'mgl−1',
'mgml−1', 'mgcm−2', 'mΩcm', 'cm−2s−1', 'cm−2', 'ions', 'moll−1',
'nmol', 'psi', 'mol·L−1', 'Jkg−1K−1', 'km', 'Wm−2', 'mass', 'mmHg',
'mmmin−1', 'GeV', 'm−2', 'm−2s−1', 'Kmin−1', 'gL−1', 'ng', 'hr', 'w',
'mN', 'kN', 'Mrad', 'rad', 'arcsec', 'Ag−1', 'dpa', 'cdm−2',
'cd', 'mcd', 'mHz', 'm−3', 'ppm', 'phr', 'mL', 'ML', 'mlmin−1', 'MWm−2',
'Wm−1K−1', 'Wm−1K−1', 'kWh', 'Wkg−1', 'Jm−3', 'm-3', 'gl−1', 'A−1',
'Ks−1', 'mgdm−3', 'mms−1', 'ks', 'appm', 'ºC', 'HV', 'kDa', 'Da', 'kG',
'kGy', 'MGy', 'Gy', 'mGy', 'Gbps', 'μB', 'μL', 'μF', 'nF', 'pF', 'mF',
'A', 'Å', 'A˚', "μgL−1"]
NR_BASIC = regex.compile(r'^[+-]?\d*\.?\d+\(?\d*\)?+$', regex.DOTALL)
NR_AND_UNIT = regex.compile(r'^([+-]?\d*\.?\d+\(?\d*\)?+)([\p{script=Latin}Ωμ]+.*)', regex.DOTALL)
PUNCT = list(string.punctuation) + ['"', '“', '”', '≥', '≤', '×']
def __init__(self, phraser_path=PHRASER_PATH):
    self.elem_name_dict = {en: es for en, es in zip(self.ELEMENT_NAMES, self.ELEMENTS)}
    self.phraser = Phraser.load(phraser_path)
def tokenize(self, text, split_oxidation=True, keep_sentences=True):
    """
    Converts a string to a list of tokens (words) using the chemdataextractor tokenizer, with a
    couple of fixes for inorganic materials science.
    Keeps the structure of sentences.

    :param text: input text as a string
    :param split_oxidation: if True, will split the oxidation state from the element, e.g. iron(II)
        will become iron (II), same with Fe(II), etc.