Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def _parse(content):
    """Parse ``content`` as a fragment and return its ``div`` lookup result.

    The fragment is parsed with html5lib's liberal XML parser, configured
    with the ``beautifulsoup`` tree builder. The return value is whatever
    ``find('div')`` yields on the parsed fragment.
    """
    xml_parser = html5lib.liberalxmlparser.XMLParser(
        tree=treebuilders.getTreeBuilder('beautifulsoup'),
    )
    fragment = xml_parser.parseFragment(content)
    return fragment.find('div')
def _shim_wrap(self, html, file_type="index", prefs=None):
"""
Applies certain corrections to the HTML source passed to this method.
Specifically adds the relevant shim script, wraps all script text
within opera.isReady() methods etc. """
# NOTE(review): leading indentation is missing throughout this excerpt and
# the body is cut off below; nesting must be confirmed against the
# original file before relying on these notes.
# Parser that produces a tree through html5lib's "dom" tree builder.
htmlparser = html5lib.HTMLParser(
tree=html5lib.treebuilders.getTreeBuilder("dom"))
# Tree walker for the same "dom" tree type.
domwalker = html5lib.treewalkers.getTreeWalker("dom")
# Serializer options: optional tags are kept, attribute values are quoted,
# whitespace is stripped and a trailing solidus is emitted.
serializer = html5lib.serializer.HTMLSerializer(
omit_optional_tags=False, quote_attr_values=True,
strip_whitespace=True, use_trailing_solidus=True)
# The input is parsed with "utf-8" passed as the second argument.
doc = htmlparser.parse(html, "utf-8")
# Starts empty; presumably accumulates inline script text later in the
# function (not visible in this excerpt) -- TODO confirm.
inlinescrdata = ""
# Local alias for the instance attribute self._nex.
nex = self._nex
# FIXME: use the correct base for the @src (mostly this is the root
# [''])
# Remove scripts only if we are merging all of them
def add_dom_prefs(doc, prefs):
""" Add an external script with the data taken from preference
elements in config.xml. Returns a tuple of doc, prefs script and
script src"""
# NOTE(review): the excerpt ends on this condition; the branch body is not
# visible here.
if isinstance(prefs, dict):
def sanitize_html(html):
    """Sanitize an HTML fragment and return it as a single string.

    The fragment is parsed into a DOM tree with ``HTMLSanitizer`` as the
    tokenizer, walked with the matching DOM tree walker, and serialized
    with optional tags kept and attribute values quoted.
    """
    dom_builder = treebuilders.getTreeBuilder("dom")
    sanitizing_parser = html5lib.HTMLParser(tokenizer=HTMLSanitizer,
                                            tree=dom_builder)
    fragment = sanitizing_parser.parseFragment(html)

    # Walk the parsed fragment to obtain the token stream for serialization.
    token_stream = treewalkers.getTreeWalker("dom")(fragment)

    html_serializer = serializer.HTMLSerializer(
        omit_optional_tags=False,
        quote_attr_values=True,
    )
    return u''.join(html_serializer.serialize(token_stream))
def __init__(self, element, soup, namespace):
    """Initialise the node from ``element`` and keep its context.

    The html5lib base ``Node`` is initialised with ``element.name``; the
    element, the owning soup and the namespace are then stored on the
    instance under the attributes of the same names.
    """
    base_node = html5lib.treebuilders._base.Node
    base_node.__init__(self, element.name)
    # Assigned left to right, i.e. element, then soup, then namespace.
    self.element, self.soup, self.namespace = element, soup, namespace
def html_parser(html):
    """Parse ``html`` and return the resulting soup.

    ``BeautifulSoup`` is tried first. If it raises, the markup is parsed
    again with an html5lib ``HTMLParser`` that uses the ``beautifulsoup``
    tree builder, and that result is returned instead.

    Only ``Exception`` subclasses trigger the fallback, so
    ``KeyboardInterrupt`` and ``SystemExit`` propagate to the caller
    instead of being silently swallowed by a bare ``except``.
    """
    try:
        return BeautifulSoup(html)
    except Exception:
        # Best-effort fallback: the html5lib-based parser is used for
        # markup that the direct BeautifulSoup call could not handle.
        fallback_parser = HTMLParser(
            tree=treebuilders.getTreeBuilder("beautifulsoup"))
        return fallback_parser.parse(html)
def __init__(self, src=None, is_full_document=False):
    """Set up the etree-based html5lib parser, walker and serializer state.

    ``src`` and ``is_full_document`` are accepted but not used by the
    statements visible here; ``self.src`` starts as an empty string and
    ``self.doc`` as ``None``.
    """
    etree_builder = html5lib.treebuilders.getTreeBuilder("etree")
    self.tree = etree_builder
    self.parser = html5lib.HTMLParser(
        tree=etree_builder,
        namespaceHTMLElements=False,
    )

    # No serializer exists yet; only the default options are recorded.
    self._serializer = None
    default_options = {
        'omit_optional_tags': False,
        'quote_attr_values': 'always',
        'escape_lt_in_attrs': True,
        'alphabetical_attributes': True,
    }
    self._default_serializer_options = default_options
    self._serializer_options = None

    self.walker = html5lib.treewalkers.getTreeWalker("etree")
    self.src = ''
    self.doc = None
def strip_tags(html):
    """Render ``html`` through the ``StripTags`` tokenizer.

    Falsy input (``None``, an empty string, ...) is not parsed at all and
    yields ``None``. Otherwise the fragment is parsed into a DOM tree with
    ``StripTags`` as the tokenizer, walked, and rendered with a default
    ``HTMLSerializer``.
    """
    if not html:
        return None

    dom_builder = treebuilders.getTreeBuilder("dom")
    fragment = html5lib.HTMLParser(
        tree=dom_builder, tokenizer=StripTags).parseFragment(html)
    token_stream = treewalkers.getTreeWalker("dom")(fragment)
    return HTMLSerializer().render(token_stream)
def pisaParser(src, context, default_css="", xhtml=False, encoding=None, xml_output=None):
"""
- Parse HTML and get miniDOM
- Extract CSS informations, add default CSS, parse CSS
- Handle the document DOM itself and build reportlab story
- Return Context object
"""
# NOTE(review): leading indentation is missing in this excerpt and the
# function body is cut off before any return statement; confirm nesting
# against the original file.
# The module-level CSS attribute cache is rebound to an empty dict here.
global CSSAttrCache
CSSAttrCache = {}
# Both branches build a parser on html5lib's "dom" tree builder; only the
# parser class differs.
if xhtml:
# TODO: XHTMLParser doesn't seem to exist...
parser = html5lib.XHTMLParser(tree=treebuilders.getTreeBuilder("dom"))
else:
parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"))
# Text input is encoded to bytes before further handling.
if isinstance(src, six.text_type):
# If an encoding was provided, do not change it.
if not encoding:
encoding = "utf-8"
src = src.encode(encoding)
# src is replaced by a pisaTempFile built with the context's capacity.
# NOTE(review): whether this line sits inside the isinstance() branch
# cannot be determined from this excerpt -- verify.
src = pisaTempFile(src, capacity=context.capacity)
# # Test for the restrictions of html5lib
# if encoding:
# # Workaround for html5lib<0.11.1
# if hasattr(inputstream, "isValidEncoding"):
# if encoding.strip().lower() == "utf8":
# encoding = "utf-8"
def pisaParser(src, context, default_css="", xhtml=False, encoding=None, xml_output=None):
"""
- Parse HTML and get miniDOM
- Extract CSS informations, add default CSS, parse CSS
- Handle the document DOM itself and build reportlab story
- Return Context object
"""
global CSSAttrCache
CSSAttrCache = {}
if xhtml:
#TODO: XHTMLParser doesn't see to exist...
parser = html5lib.XHTMLParser(tree=treebuilders.getTreeBuilder("dom"))
else:
parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"))
if isinstance(src, six.text_type):
# If an encoding was provided, do not change it.
if not encoding:
encoding = "utf-8"
src = src.encode(encoding)
src = pisaTempFile(src, capacity=context.capacity)
# # Test for the restrictions of html5lib
# if encoding:
# # Workaround for html5lib<0.11.1
# if hasattr(inputstream, "isValidEncoding"):
# if encoding.strip().lower() == "utf8":
# encoding = "utf-8"
# if not inputstream.isValidEncoding(encoding):
# log.error("%r is not a valid encoding e.g. 'utf8' is not valid but 'utf-8' is!", encoding)