Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test_bleach_html_parser(parser_args, data, expected):
    """Parse ``data`` as a fragment, serialize it, and compare to ``expected``.

    :arg parser_args: keyword arguments that override the parser defaults
        used here (``tags=None``, ``strip=True``, ``consume_entities=True``)
    :arg data: the HTML text handed to ``parseFragment``
    :arg expected: the serialized output the round trip must produce
    """
    # Start from the defaults and let the test case override any of them.
    parser_kwargs = dict(tags=None, strip=True, consume_entities=True)
    parser_kwargs.update(parser_args)

    serializer_kwargs = dict(
        quote_attr_values='always',
        omit_optional_tags=False,
        escape_lt_in_attrs=True,
        resolve_entities=False,
        sanitize=False,
        alphabetical_attributes=False,
    )

    # Parser, walker, and serializer are put together the way clean() does it.
    html_parser = html5lib_shim.BleachHTMLParser(**parser_kwargs)
    tree_walker = html5lib_shim.getTreeWalker('etree')
    html_serializer = html5lib_shim.BleachHTMLSerializer(**serializer_kwargs)

    # Round-trip the input: parse it, walk the tree, render it back to text.
    fragment = html_parser.parseFragment(data)
    assert html_serializer.render(tree_walker(fragment)) == expected
def test_serializer(data, expected):
# Build a parser, walker, and serializer just like we do in clean()
parser = html5lib_shim.BleachHTMLParser(
tags=None,
strip=True,
consume_entities=False,
namespaceHTMLElements=False
)
walker = html5lib_shim.getTreeWalker('etree')
serializer = html5lib_shim.BleachHTMLSerializer(
quote_attr_values='always',
omit_optional_tags=False,
escape_lt_in_attrs=True,
resolve_entities=False,
sanitize=False,
alphabetical_attributes=False,
)
# Parse, walk, and then serialize the output
:arg list-of-strings recognized_tags: the list of tags that linkify knows about;
everything else gets escaped
:returns: linkified text as unicode
"""
self.callbacks = callbacks
self.skip_tags = skip_tags
self.parse_email = parse_email
self.url_re = url_re
self.email_re = email_re
# Create a parser/tokenizer that allows all HTML tags and escapes
# anything not in that list.
self.parser = html5lib_shim.BleachHTMLParser(
tags=recognized_tags,
strip=False,
consume_entities=True,
namespaceHTMLElements=False,
)
self.walker = html5lib_shim.getTreeWalker('etree')
self.serializer = html5lib_shim.BleachHTMLSerializer(
quote_attr_values='always',
omit_optional_tags=False,
# linkify does not sanitize
sanitize=False,
# linkify's serializer does not alphabetize attributes
alphabetical_attributes=False,
)
.. Warning::
Using filters changes the output of ``bleach.Cleaner.clean``.
Make sure the way the filters change the output is secure.
"""
self.tags = tags
self.attributes = attributes
self.styles = styles
self.protocols = protocols
self.strip = strip
self.strip_comments = strip_comments
self.filters = filters or []
self.parser = html5lib_shim.BleachHTMLParser(
tags=self.tags,
strip=self.strip,
consume_entities=False,
namespaceHTMLElements=False
)
self.walker = html5lib_shim.getTreeWalker('etree')
self.serializer = html5lib_shim.BleachHTMLSerializer(
quote_attr_values='always',
omit_optional_tags=False,
escape_lt_in_attrs=True,
# We want to leave entities as they are without escaping or
# resolving or expanding
resolve_entities=False,
# Bleach has its own sanitizer, so don't use the html5lib one
def __init__(self, tags, strip, consume_entities, **kwargs):
    """Store the tag-handling options, then initialize the parent parser.

    :arg tags: list of allowed tags--everything else is either stripped or
        escaped; if None, then this doesn't look at tags at all
    :arg strip: whether to strip disallowed tags (True) or escape them
        (False); if tags=None, then this doesn't have any effect
    :arg consume_entities: whether to consume entities (default behavior)
        or leave them as is when tokenizing (BleachHTMLTokenizer-added
        behavior)

    Any remaining keyword arguments are forwarded unchanged to the parent
    class initializer.
    """
    if tags is None:
        # None means "don't look at tags at all", so keep it as None.
        self.tags = None
    else:
        # Tag names are stored lowercased.
        self.tags = [name.lower() for name in tags]

    self.strip = strip
    self.consume_entities = consume_entities
    super(BleachHTMLParser, self).__init__(**kwargs)