Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def stringify(value, encoding_default=DEFAULT_ENCODING, encoding=None):
"""Brute-force convert a given object to a string.
This will attempt an increasingly mean set of conversions to make a given
object into a unicode string. It is guaranteed to either return unicode or
None, if all conversions failed (or the value is indeed empty).
"""
if value is None:
return None
if not isinstance(value, six.text_type):
if isinstance(value, (date, datetime)):
return value.isoformat()
elif isinstance(value, (float, Decimal)):
return Decimal(value).to_eng_string()
elif isinstance(value, six.binary_type):
if encoding is None:
def sanitize_text(text, encoding=DEFAULT_ENCODING):
    """Turn ``text`` into a cleaned-up string, or ``None`` on failure.

    The value is first converted with ``stringify`` (``encoding`` is handed
    over as its ``encoding_default``), then passed through ``compose_nfc``
    and ``remove_unsafe_chars``. Finally the result is round-tripped through
    ``DEFAULT_ENCODING`` using the "replace" error handler.

    Returns ``None`` when ``stringify`` yields ``None`` or when
    ``compose_nfc`` raises; the latter case is logged as a warning.
    """
    converted = stringify(text, encoding_default=encoding)
    if converted is None:
        return None

    try:
        composed = compose_nfc(converted)
    except (SystemError, Exception) as ex:
        log.warning("Cannot NFC text: %s", ex)
        return None

    cleaned = remove_unsafe_chars(composed)
    # Encode and decode again so that anything the default encoding cannot
    # represent is substituted by the "replace" handler.
    raw = cleaned.encode(DEFAULT_ENCODING, "replace")
    return raw.decode(DEFAULT_ENCODING, "replace")
def sanitize_text(text, encoding=DEFAULT_ENCODING):
    """Turn ``text`` into a cleaned-up string, or ``None`` on failure.

    The value is first converted with ``stringify`` (``encoding`` is handed
    over as its ``encoding_default``), then passed through ``compose_nfc``
    and ``remove_unsafe_chars``. Finally the result is round-tripped through
    ``DEFAULT_ENCODING`` using the "replace" error handler.

    Returns ``None`` when ``stringify`` yields ``None`` or when
    ``compose_nfc`` raises; the latter case is logged as a warning.
    """
    converted = stringify(text, encoding_default=encoding)
    if converted is None:
        return None

    try:
        composed = compose_nfc(converted)
    except (SystemError, Exception) as ex:
        log.warning("Cannot NFC text: %s", ex)
        return None

    cleaned = remove_unsafe_chars(composed)
    # Encode and decode again so that anything the default encoding cannot
    # represent is substituted by the "replace" handler.
    raw = cleaned.encode(DEFAULT_ENCODING, "replace")
    return raw.decode(DEFAULT_ENCODING, "replace")
def normalize(text, lowercase=True, collapse=True, latinize=False, ascii=False,
encoding_default=DEFAULT_ENCODING, encoding=None,
replace_categories=UNICODE_CATEGORIES):
"""The main normalization function for text.
This will take a string and apply a set of transformations to it so
that it can be processed more easily afterwards. Arguments:
* ``lowercase``: not very mysterious.
* ``collapse``: replace multiple whitespace-like characters with a
single whitespace. This is especially useful with category replacement
which can lead to a lot of whitespace.
* ``decompose``: apply a unicode normalization (NFKD) to separate
simple characters and their diacritics.
* ``replace_categories``: This will perform a replacement of whole
classes of unicode characters (e.g. symbols, marks, numbers) with a
given character. It is used to replace any non-text elements of the
input string.
def sanitize_text(text, encoding=DEFAULT_ENCODING):
    """Turn ``text`` into a cleaned-up string, or ``None`` on failure.

    The value is first converted with ``stringify`` (``encoding`` is handed
    over as its ``encoding_default``), then passed through ``compose_nfc``
    and ``remove_unsafe_chars``. Finally the result is round-tripped through
    ``DEFAULT_ENCODING`` using the "replace" error handler.

    Returns ``None`` when ``stringify`` yields ``None`` or when
    ``compose_nfc`` raises; the latter case is logged as a warning.
    """
    converted = stringify(text, encoding_default=encoding)
    if converted is None:
        return None

    try:
        composed = compose_nfc(converted)
    except (SystemError, Exception) as ex:
        log.warning("Cannot NFC text: %s", ex)
        return None

    cleaned = remove_unsafe_chars(composed)
    # Encode and decode again so that anything the default encoding cannot
    # represent is substituted by the "replace" handler.
    raw = cleaned.encode(DEFAULT_ENCODING, "replace")
    return raw.decode(DEFAULT_ENCODING, "replace")