from __future__ import unicode_literals
import regex as re
from datetime import datetime
from datetime import time
from tzlocal import get_localzone
from dateutil.relativedelta import relativedelta
from dateparser.utils import apply_timezone, localize_timezone, strip_braces
from .parser import time_parser
from .timezone_parser import pop_tz_offset_from_string
_UNITS = r'year|month|week|day|hour|minute|second'
PATTERN = re.compile(r'(\d+)\s*(%s)\b' % _UNITS, re.I | re.S | re.U)
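# Illustrative check (not part of the original module): PATTERN pulls out
# magnitude/unit pairs. Note that dateparser's language dictionaries normalize
# plural units ('months' -> 'month') before this pattern is applied, so:
#   >>> PATTERN.findall('1 year, 2 month ago')
#   [('1', 'year'), ('2', 'month')]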
class FreshnessDateDataParser(object):
    """ Parses date strings like "1 year, 2 months ago" and "3 hours, 50 minutes ago" """

    def __init__(self):
        self.now = None

    def _are_all_words_units(self, date_string):
        skip = [_UNITS,
                r'ago|in|\d+',
                r':|[ap]m']
        date_string = re.sub(r'\s+', ' ', date_string.strip())
        words = filter(None, re.split(r'\W', date_string))
        words = filter(lambda x: not re.match(r'%s' % '|'.join(skip), x), words)
        # Every word was a unit/skip token if nothing survives the filters.
        return not list(words)
# coding: utf8
"""Space utils"""
import regex as re
from urduhack.urdu_characters import URDU_ALL_CHARACTERS, URDU_PUNCTUATIONS
# Add a space between a number and an adjoining Urdu word, e.g.:
# 18سالہ , 20فیصد
SPACE_BEFORE_DIGITS_RE = re.compile(r"(?<=[" + "".join(URDU_ALL_CHARACTERS) + "])(?=[0-9])", flags=re.U | re.M | re.I)
SPACE_AFTER_DIGITS_RE = re.compile(r"(?<=[0-9])(?=[" + "".join(URDU_ALL_CHARACTERS) + "])", flags=re.U | re.M | re.I)
# Add a space after punctuation, but not when a digit follows (e.g. 9.00 stays intact)
SPACE_AFTER_PUNCTUATIONS_RE = re.compile(
r"(?<=[" + "".join(URDU_PUNCTUATIONS) + "])(?=[^" + "".join(URDU_PUNCTUATIONS) + "0-9 ])",
flags=re.U | re.M | re.I)
def digits_space(text: str) -> str:
    """
    Add spaces between digits and adjoining Urdu characters.

    Args:
        text (str): text to process

    Returns:
        str: text with spaces inserted
    """
    text = SPACE_BEFORE_DIGITS_RE.sub(' ', text)
    text = SPACE_AFTER_DIGITS_RE.sub(' ', text)
    return text
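# Illustrative usage: a space is inserted at each digit/Urdu boundary.
#   >>> digits_space("18سالہ")
#   '18 سالہ'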
APOSTROPHE_LOOK_ALIKE_CHARS = [
    u'\N{RIGHT SINGLE QUOTATION MARK}',   # u'\u2019'
    u'\N{MODIFIER LETTER APOSTROPHE}',    # u'\u02bc'
    u'\N{MODIFIER LETTER TURNED COMMA}',  # u'\u02bb'
    u'\N{ARMENIAN APOSTROPHE}',           # u'\u055a'
    u'\N{LATIN SMALL LETTER SALTILLO}',   # u'\ua78c'
    u'\N{PRIME}',                         # u'\u2032'
    u'\N{REVERSED PRIME}',                # u'\u2035'
    u'\N{MODIFIER LETTER PRIME}',         # u'\u02b9'
    u'\N{FULLWIDTH APOSTROPHE}',          # u'\uff07'
]
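# A minimal sketch (assumption: these look-alikes are normalized to the plain
# ASCII apostrophe before parsing; the library's actual normalization may differ):
def _normalize_apostrophes(text):
    for ch in APOSTROPHE_LOOK_ALIKE_CHARS:
        text = text.replace(ch, "'")
    return text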
DATE_ORDER_PATTERN = re.compile(u'([DMY])+\u200f*[-/. \t]*([DMY])+\u200f*[-/. \t]*([DMY])+')
RELATIVE_PATTERN = re.compile(r'(?
import logging
import ipaddress
import email.utils
import regex
import synapse.exc as s_exc
import synapse.common as s_common
import synapse.lib.chop as s_chop
import synapse.lib.types as s_types
import synapse.lib.scrape as s_scrape
import synapse.lib.module as s_module
import synapse.lookup.iana as s_l_iana
logger = logging.getLogger(__name__)
fqdnre = regex.compile(r'^[\w._-]+$', regex.U)
srv6re = regex.compile(r'^\[([a-f0-9:]+)\]:(\d+)$')
cidrmasks = [((0xffffffff - (2 ** (32 - i) - 1)), (2 ** (32 - i))) for i in range(33)]
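# cidrmasks[i] is the (netmask, block size) pair for an IPv4 /i prefix,
# e.g. cidrmasks[24] == (0xffffff00, 256).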
def getAddrType(ip):
    if ip.is_multicast:
        return 'multicast'
    if ip.is_loopback:
        return 'loopback'
    if ip.is_link_local:
        return 'linklocal'
    # Assumed completion of this truncated snippet: classify private
    # address space, then fall back to unicast.
    if ip.is_private:
        return 'private'
    return 'unicast'
def split_by_regex(self, regex_or_pattern, flags=re.U, gaps=True):
    """Split the text into multiple instances using a regex.

    Parameters
    ----------
    regex_or_pattern: str or compiled pattern
        The regular expression to use for splitting.
    flags: int (default: re.U)
        The regular expression flags (only used when the user has not supplied a compiled regex).
    gaps: boolean (default: True)
        If True, the regions matched by the regex are left out of the resulting
        Text instances, which is the usual splitting behaviour.
        If False, only the regions matched by the regex are included in the result.

    Returns
    -------
    list of Text
    """
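# Illustrative behaviour (hypothetical calls; the concrete estnltk Text API may differ):
#   Text('one, two').split_by_regex(r',\s*')             -> [Text('one'), Text('two')]
#   Text('one, two').split_by_regex(r'\w+', gaps=False)  -> [Text('one'), Text('two')]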
def _get_simplifications(self, settings=None):
    # 'no_word_spacing' is stored in the language info as the string
    # 'True'/'False', hence the eval.
    no_word_spacing = eval(self.info.get('no_word_spacing', 'False'))
    if settings.NORMALIZE:
        if self._normalized_simplifications is None:
            self._normalized_simplifications = []
            simplifications = self._generate_simplifications(normalize=True)
            for simplification in simplifications:
                pattern, replacement = list(simplification.items())[0]
                if not no_word_spacing:
                    pattern = r'(?<=\A|\W|_)%s(?=\Z|\W|_)' % pattern
                pattern = re.compile(pattern, flags=re.I | re.U)
                self._normalized_simplifications.append({pattern: replacement})
        return self._normalized_simplifications
    else:
        if self._simplifications is None:
            self._simplifications = []
            simplifications = self._generate_simplifications(normalize=False)
            for simplification in simplifications:
                pattern, replacement = list(simplification.items())[0]
                if not no_word_spacing:
                    pattern = r'(?<=\A|\W|_)%s(?=\Z|\W|_)' % pattern
                pattern = re.compile(pattern, flags=re.I | re.U)
                self._simplifications.append({pattern: replacement})
        return self._simplifications
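# Example of the word-boundary wrapping above (illustrative): a simplification
# pattern like 'hr' becomes r'(?<=\A|\W|_)hr(?=\Z|\W|_)', so it matches the
# standalone token in '5 hr ago' but not the 'hr' inside 'three'.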
def __init__(self, arpabet='arpabet', ligatures=False, cedict_file=None):
    """Construct a Flite "wrapper"

    Args:
        arpabet (str): file containing ARPAbet to IPA mapping
        ligatures (bool): if True, use non-standard ligatures instead of
            standard IPA
        cedict_file (str): path to CC-CEDict dictionary (included for
            compatibility)
    """
    arpabet = pkg_resources.resource_filename(__name__, os.path.join('data', arpabet + '.csv'))
    self.arpa_map = self._read_arpabet(arpabet)
    self.chunk_re = re.compile(r"([A-Za-z'’]+|[^A-Za-z'’]+)", re.U)
    self.letter_re = re.compile(r"[A-Za-z'’]+")
    self.regexp = re.compile(r'[A-Za-z]')
    self.puncnorm = PuncNorm()
    self.ligatures = ligatures
    self.ft = panphon.FeatureTable()
    self.num_panphon_fts = len(self.ft.names)
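# Note: chunk_re splits text into alternating runs of letters (apostrophes
# included) and non-letters, e.g. findall on "don't stop" gives
# ["don't", ' ', 'stop'] (illustrative).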
def test_diff_char(self):
    # check that confusables_data doesn't propose the same character
    import regex
    wrong = 0
    for group in confusables_data.confusables.values():
        proposals = confusables_data.confusables_fix.get(group)
        for (script, prop) in proposals.items():
            # Name the compiled pattern so it doesn't shadow the stdlib 're'.
            script_re = regex.compile(r"[\p{%s}]" % script, flags=regex.V1 | regex.U)
            if script_re.match(prop):
                pass
            elif group == prop:
                wrong += 1
                print("group=%s, script=%s, prop=%s" % (group, script, prop))
    assert wrong == 0
def do_query(issues, interesting_words_file, _):
    """
    Get the words which appear together in articles.
    """
    # Get the list of words to search for
    interesting_words = [re.compile(r'\b' + word.strip() + r'\b', re.I | re.U)
                         for word in list(open(interesting_words_file))]
    # Map each article in each issue to its year of publication
    articles = issues.flatMap(lambda issue: [(issue.date.year, article)
                                             for article in issue.articles])
    # Find the words for each article
    interest = articles.flatMap(make_search(interesting_words))
    # Now sum the year-word counts and change the format for output
    interesting_by_year = interest \
        .reduceByKey(add) \
        .map(split_key) \
        .groupByKey() \
        .map(snd_to_list) \
        .collect()
    return interesting_by_year
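# Illustrative result shape (hypothetical data; assumes split_key maps
# ((year, word), n) to (word, (year, n))):
#   [('liberty', [(1900, 12), (1901, 8)]), ('empire', [(1900, 3)])]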
from six.moves import zip_longest
from dateparser.utils import normalize_unicode
PARSER_HARDCODED_TOKENS = [":", ".", " ", "-", "/"]
PARSER_KNOWN_TOKENS = ["am", "pm", "UTC", "GMT", "Z"]
ALWAYS_KEEP_TOKENS = ["+"] + PARSER_HARDCODED_TOKENS
KNOWN_WORD_TOKENS = ['monday', 'tuesday', 'wednesday', 'thursday', 'friday',
'saturday', 'sunday', 'january', 'february', 'march',
'april', 'may', 'june', 'july', 'august', 'september',
'october', 'november', 'december', 'year', 'month', 'week',
'day', 'hour', 'minute', 'second', 'ago', 'in', 'am', 'pm']
PARENTHESES_PATTERN = re.compile(r'[\(\)]')
NUMERAL_PATTERN = re.compile(r'(\d+)')
KEEP_TOKEN_PATTERN = re.compile(r"^.*[^\W_].*$", flags=re.U)
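# KEEP_TOKEN_PATTERN keeps any token containing at least one word character
# other than underscore, e.g. it matches 'am' and '12' but not '--' or '_'.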
class UnknownTokenError(Exception):
    pass


class Dictionary(object):
    """
    Class that modifies and stores translations and handles splitting of the date string.

    :param locale_info:
        Locale info (translation data) of the locale.
    :type locale_info: dict

    :param settings:
        Configure customized behavior using settings defined in :mod:`dateparser.conf.Settings`.