Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
attribute) pairs that contain URLs to be rewritten.
"""
tree_builder = html5lib.treebuilders.getTreeBuilder('dom')
parser = html5lib.html5parser.HTMLParser(tree = tree_builder)
dom = parser.parse(src)
for tag, attr in url_attributes:
for e in dom.getElementsByTagName(tag):
u = e.getAttribute(attr)
if u and not url_filter(urljoin(src_base, u)):
rewritten = urljoin(rewrite_base, u)
if u != rewritten:
e.setAttribute(attr, rewritten)
tree_walker = html5lib.treewalkers.getTreeWalker('dom')
html_serializer = html5lib.serializer.htmlserializer.HTMLSerializer()
return u''.join(html_serializer.serialize(tree_walker(dom)))
def sanitize_html(html):
    """Return a sanitized unicode serialization of an HTML fragment.

    ``html`` is parsed as a fragment into a DOM tree by an html5lib parser
    that uses ``HTMLSanitizer`` as its tokenizer. The resulting tree is then
    walked and serialized back to markup with optional tags kept
    (``omit_optional_tags=False``) and ``quote_attr_values=True``.
    """
    dom_builder = treebuilders.getTreeBuilder("dom")
    fragment_parser = html5lib.HTMLParser(tokenizer=HTMLSanitizer,
                                          tree=dom_builder)
    fragment = fragment_parser.parseFragment(html)

    # Turn the DOM fragment into the token stream the serializer consumes.
    token_stream = treewalkers.getTreeWalker("dom")(fragment)

    html_serializer = serializer.HTMLSerializer(omit_optional_tags=False,
                                                quote_attr_values=True)
    # serialize() yields text chunks; glue them into a single string.
    return u''.join(html_serializer.serialize(token_stream))
def html2text(html, sanitize=False, ignore_br=False):
""" Takes utf-8 encoded page and returns unicode text """
p = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"))
dom_tree = p.parse(html.decode("utf-8"))
walker = treewalkers.getTreeWalker("dom")
stream = walker(dom_tree)
space_introducing_tags = set(['th', 'td'])
# Add space around spans
# This technically violates the standard as spans
# don't introduce whitespace. In practice whitespace
# is often added via CSS and spans rarely end in the
# middle of a word.
space_introducing_tags.add('span')
line_break_tags = block_level_elements
line_break_tags.add('tr') # introduces line-break
line_break_tags.add('li') # <li> introduces line-break
line_break_tags.add('option') # <option> introduces line-break
if ignore_br:
def to_unicode(self):
    """Render ``self._root`` to unicode and strip a fixed-size wrapper.

    The tree is walked with the walker for ``self.TREEBUILDER`` and rendered
    with attribute values quoted and optional tags kept. The result is then
    sliced: ``len(self.CONTAINER_TAG) + 2`` characters are dropped from the
    front and one more than that from the back.
    """
    # Presumably the rendered root is wrapped as
    # <CONTAINER_TAG>...</CONTAINER_TAG>: the opening tag adds "<" and ">"
    # to the tag name, the closing tag additionally carries the "/".
    wrapper_len = len(self.CONTAINER_TAG) + 2

    tree_walker = getTreeWalker(self.TREEBUILDER)
    token_stream = tree_walker(self._root)
    html_serializer = HTMLSerializer(quote_attr_values=True,
                                     omit_optional_tags=False)
    rendered = html_serializer.render(token_stream)
    return rendered[wrapper_len:-(wrapper_len + 1)]
"""
Return a slice of ``html`` <= length chars.
killwords and end are currently ignored.
ONLY USE FOR KNOWN-SAFE HTML.
"""
tree = html5lib.parseFragment(html)
if text_length(tree) <= length:
return jinja2.Markup(html)
else:
# Get a truncated version of the tree.
short, _ = trim(tree, length, killwords, end)
# Serialize the parsed tree back to html.
walker = html5lib.treewalkers.getTreeWalker('etree')
stream = walker(short)
serializer = html5lib.serializer.HTMLSerializer(
quote_attr_values='always', omit_optional_tags=False)
return jinja2.Markup(force_text(serializer.render(stream)))
ncss = dom.createElement("style")
ncss.setAttribute(u"type", u"text/css")
ncss.appendChild(dom.createTextNode(csstext))
css.parentNode.insertBefore(ncss, css)
css.parentNode.removeChild(css)
_embed_images(dom, rootdirs)
#Save out the new html file
with open(outfile, "w") as htmlfile:
# Fix error due to changes new version of html5lib (> 0.9999...)
try:
serializer = html5lib.serializer.htmlserializer.HTMLSerializer()
except AttributeError:
serializer = html5lib.serializer.HTMLSerializer()
walker = html5lib.treewalkers.getTreeWalker("dom")
for line in serializer.serialize(walker(dom)):
htmlfile.write(line)
document = [document]
for fragment in document:
print(parser.tree.testSerializer(fragment))
elif opts.hilite:
sys.stdout.write(document.hilite("utf-8"))
elif opts.html:
kwargs = {}
for opt in serializer.HTMLSerializer.options:
try:
kwargs[opt] = getattr(opts,opt)
except:
pass
if not kwargs['quote_char']:
del kwargs['quote_char']
tokens = treewalkers.getTreeWalker(opts.treebuilder)(document)
if sys.version_info[0] >= 3:
encoding = None
else:
encoding = "utf-8"
for text in serializer.HTMLSerializer(**kwargs).serialize(tokens, encoding=encoding):
sys.stdout.write(text)
if not text.endswith('\n'): sys.stdout.write('\n')
if opts.error:
errList=[]
for pos, errorcode, datavars in parser.errors:
errList.append("Line %i Col %i"%pos + " " + constants.E.get(errorcode, 'Unknown error "%s"' % errorcode) % datavars)
sys.stdout.write("\nParse errors:\n" + "\n".join(errList)+"\n")
def camoify(ctx, value):
    """Rewrite the ``src`` of every image in ``value`` via ``camo_url``.

    The request is taken from ``ctx["request"]`` when present and truthy,
    otherwise from ``get_current_request()``. ``value`` is parsed as a full
    HTML document into a DOM tree; each ``<img>`` element with a non-empty
    ``src`` gets that attribute replaced by ``request.camo_url(src)``. The
    modified tree is serialized back to a string and returned.
    """
    request = ctx.get("request") or get_current_request()

    # Build a DOM from the rendered output so image sources can be edited
    # structurally instead of by string manipulation.
    dom_builder = html5lib.treebuilders.getTreeBuilder("dom")
    document = html5lib.html5parser.HTMLParser(tree=dom_builder).parse(value)

    for img in document.getElementsByTagName("img"):
        original_src = img.getAttribute("src")
        if not original_src:
            # Nothing to rewrite on an <img> without a usable src.
            continue
        img.setAttribute("src", request.camo_url(original_src))

    walk = html5lib.treewalkers.getTreeWalker("dom")
    html_serializer = html5lib.serializer.HTMLSerializer()
    return "".join(html_serializer.serialize(walk(document)))