import html5lib


def text_blocks(html_text, skip_pre=True):
    """
    Extracts a list of paragraphs from a given HTML string
    """
    doc = html5lib.parseFragment(html_text)
    blocks = []

    def subloop(parent_tag, element, lastchild=False):
        if callable(element.tag):
            # Comments have a callable tag. TODO: find out what else does.
            tag = ''
            text = ''
            tail = element.tail or u''
        else:
            # Strip the namespace from the tag, e.g.
            # {http://www.w3.org/1999/xhtml}html -> html
            tag = element.tag.split('}')[-1]
            text = element.text or u''
            tail = element.tail or u''
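
# A small usage sketch (not from the original source): parseFragment returns an
# element-tree fragment whose children carry namespaced tags, which is why
# text_blocks splits on '}' above.
import html5lib

frag = html5lib.parseFragment('<p>one</p><p>two</p>')
for child in frag:
    # e.g. {http://www.w3.org/1999/xhtml}p -> 'p'
    print(child.tag.split('}')[-1], child.text)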

def run(self, text):
    parsed = html5lib.parseFragment(text)
    # If we didn't have to customize the sanitization, this could simply be:
    #   return html5lib.serialize(parsed, sanitize=True)
    # Instead we perform the same steps as that function, but insert our
    # ForgeHTMLSanitizerFilter where sanitize=True would use the stock filter.
    TreeWalker = html5lib.treewalkers.getTreeWalker("etree")
    walker = TreeWalker(parsed)
    walker = ForgeHTMLSanitizerFilter(walker)  # this is our custom step
    s = html5lib.serializer.HTMLSerializer()
    return s.render(walker)
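
# For comparison, a minimal sketch of the standard path the comment above
# mentions: sanitize=True inserts html5lib's stock sanitizer filter at the
# same point in the treewalker pipeline where ForgeHTMLSanitizerFilter sits.
import html5lib

def sanitize_html(text):
    parsed = html5lib.parseFragment(text)
    return html5lib.serialize(parsed, sanitize=True)

# e.g. sanitize_html('<p onclick="evil()">hi</p>') strips the onclick handler.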

def find_iter(skeleton, document):
    """
    Return an iterator that yields elements from the document that
    match the given skeleton.

    See `find_all` for details.
    """
    if is_string(document):
        document = html5lib.parse(document)
    if is_string(skeleton):
        fragment = html5lib.parseFragment(skeleton)
        if len(fragment) != 1:
            raise ValueError("Skeleton must have exactly one root element.")
        skeleton = fragment[0]
    for element in document.iter():
        if node_matches_bone(element, skeleton):
            yield element
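
# is_string and node_matches_bone are helpers defined elsewhere in the same
# library. A plausible minimal node_matches_bone (an assumption, not the real
# implementation) matches the tag and treats the skeleton's attributes as a
# required subset of the element's:
def node_matches_bone(element, skeleton):
    return (element.tag == skeleton.tag and
            all(element.attrib.get(k) == v
                for k, v in skeleton.attrib.items()))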

def parse_comments(self, root, raw):
    try:
        from urllib.parse import unquote
    except ImportError:
        from urllib import unquote
    ans = ''
    ns = tuple(self.selector('#bookDescription_feature_div noscript'))
    if ns:
        ns = ns[0]
        if len(ns) == 0 and ns.text:
            import html5lib
            # html5lib parsed noscript as CDATA
            ns = html5lib.parseFragment(
                '<div>%s</div>' % (ns.text),
                treebuilder='lxml', namespaceHTMLElements=False)[0]
        else:
            ns.tag = 'div'
        ans = self._render_comments(ns)
    else:
        desc = root.xpath('//div[@id="ps-content"]/div[@class="content"]')
        if desc:
            ans = self._render_comments(desc[0])

    desc = root.xpath('//div[@id="productDescription"]/*[@class="content"]')
    if desc:
        ans += self._render_comments(desc[0])
    else:
        # Idiot chickens from amazon strike again. This data is now stored
        # in a JS variable inside a script tag, URL encoded.
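
# The truncated else-branch above goes on to pull a URL-encoded description
# out of a <script> tag, which is what the unquote import at the top is for.
# A hedged sketch of that idea (decode_embedded_description is a hypothetical
# helper; the real extraction logic is not shown in this excerpt):
from urllib.parse import unquote  # Python 3; the try/except above covers Python 2

import html5lib

def decode_embedded_description(encoded):
    # `encoded` is the URL-encoded HTML string found inside the script tag.
    return html5lib.parseFragment(
        '<div>%s</div>' % unquote(encoded),
        treebuilder='lxml', namespaceHTMLElements=False)[0]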

from html5lib.serializer import HTMLSerializer


def parse_html(tree):
    tree.text = tree.text.rstrip('\n')
    # Remove the first newline after a block-level element.
    if tree.tail and tree.tail.startswith('\n'):
        tree.tail = tree.tail[1:]
    for child in tree:  # Recurse down the tree.
        if tree.tag in html_blocks:
            # Strip newlines directly inside block-level elements: remove
            # the last newlines from the children's tails.
            if child.tail:
                child.tail = child.tail.rstrip('\n')
        parse_html(child)
    return tree

parse = parse_html(html5lib.parseFragment(string))
# Serialize the parsed tree back to html.
walker = html5lib.treewalkers.getTreeWalker('etree')
stream = walker(parse)
serializer = HTMLSerializer(quote_attr_values='always',
                            omit_optional_tags=False)
return serializer.render(stream)
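
# The serializer options above are easiest to see in isolation:
# quote_attr_values='always' quotes every attribute value, and
# omit_optional_tags=False keeps optional closing tags such as </p>.
import html5lib
from html5lib.serializer import HTMLSerializer

tree = html5lib.parseFragment('<p class=x>hi')
walker = html5lib.treewalkers.getTreeWalker('etree')(tree)
print(HTMLSerializer(quote_attr_values='always',
                     omit_optional_tags=False).render(walker))
# -> <p class="x">hi</p>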

def _ProcessString(content):
    content = linkify.LinkifyWebUrls(content)
    return html5lib.parseFragment(content, treebuilder='dom')
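
# linkify.LinkifyWebUrls comes from the surrounding project; the html5lib part
# is the treebuilder choice. With treebuilder='dom', parseFragment returns an
# xml.dom.minidom DocumentFragment, and the matching 'dom' treewalker
# serializes it back:
import html5lib

frag = html5lib.parseFragment('see <b>bold</b> text', treebuilder='dom')
walker = html5lib.treewalkers.getTreeWalker('dom')(frag)
print(html5lib.serializer.HTMLSerializer().render(walker))
# -> see <b>bold</b> text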

def truncate(html, length, killwords=False, end='...'):
    """
    Return a slice of ``html`` <= length chars.

    killwords and end are currently ignored.
    ONLY USE FOR KNOWN-SAFE HTML.
    """
    tree = html5lib.parseFragment(html)
    if text_length(tree) <= length:
        return jinja2.Markup(html)
    else:
        # Get a truncated version of the tree.
        short, _ = trim(tree, length, killwords, end)
        # Serialize the truncated tree back to html.
        walker = html5lib.treewalkers.getTreeWalker('etree')
        stream = walker(short)
        serializer = html5lib.serializer.HTMLSerializer(
            quote_attr_values='always',  # 'always' replaces the old boolean form
            omit_optional_tags=False)
        return jinja2.Markup(force_unicode(serializer.render(stream)))
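
# text_length and trim come from the surrounding module. A minimal text_length
# (an assumption about its contract, not the real implementation) counts the
# visible characters in an element-tree fragment:
def text_length(tree):
    total = len(tree.text or '')
    for child in tree:
        total += text_length(child) + len(child.tail or '')
    return total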

def transform(node):
    node.remove(div_children[0])
    for key in list(node.attrib.keys()):
        if key.startswith('xmlns:') or key in ('id', 'class', 'style'):
            del node.attrib[key]
    for child in node.getchildren():
        new_child = transform(child)
        if new_child != child:
            if new_child is not None:
                child.addnext(new_child)
            node.remove(child)
    return node

prefix, root_node = html5lib.parseFragment(text, treebuilder='lxml')
node = transform(root_node)
node.attrib.clear()
node.tail = None
walker = html5lib.treewalkers.getTreeWalker('lxml')
stream = walker(node)
serializer = html5lib.serializer.HTMLSerializer(omit_optional_tags=True)
output_generator = serializer.serialize(stream)
return u''.join(output_generator)
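
# serialize() returns a generator of output chunks, which is why the code
# above joins them; render() is the convenience form that does the join
# itself. A self-contained round trip:
import html5lib

tree = html5lib.parseFragment('<em>hi</em> there')
walker = html5lib.treewalkers.getTreeWalker('etree')(tree)
serializer = html5lib.serializer.HTMLSerializer(omit_optional_tags=True)
print(u''.join(serializer.serialize(walker)))
# -> <em>hi</em> there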