from lxml import html

def foo():
    source = u"<p>今日は html5lib!</p>"
    # doc = html5lib.parse(source, treebuilder="lxml")
    doc = html.document_fromstring(source)
    # print type(doc), repr(doc), str(doc), dir(doc)
    assert 2 + 2 == 4  # sanity check
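# A minimal aside (a sketch, not part of the snippet above): document_fromstring
# always builds a complete <html>/<body> tree around its input, whereas
# fragment_fromstring parses just the element itself.
from lxml import html

doc = html.document_fromstring(u"<p>今日は html5lib!</p>")
print(doc.tag)    # 'html' -- a full document was constructed
frag = html.fragment_fromstring(u"<p>今日は html5lib!</p>")
print(frag.tag)   # 'p' -- only the fragment element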
from bs4 import UnicodeDammit
from lxml import html

def get_unicode_root(fd):
    # Sniff the encoding from the raw bytes, then parse with that encoding.
    content = fd.read()
    doc = UnicodeDammit(content, is_html=True)
    parser = html.HTMLParser(encoding=doc.original_encoding)
    root = html.document_fromstring(content, parser=parser)
    return root
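# Hypothetical usage of the helper above; 'page.html' is an assumed file name
# used purely for illustration. The file is opened in binary mode so that
# UnicodeDammit sees the raw bytes and can detect the encoding.
with open('page.html', 'rb') as fd:
    root = get_unicode_root(fd)
    print(root.findtext('.//title'))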
import logging
import urllib2

import lxml.html

# CC_URL and cache are module-level configuration in the original project.

def harvest_latest():
    """Retrieves the latest snapshot of the live CC site and uses XPath to
    save portions of the site to the cache."""
    try:
        cc_home = urllib2.urlopen(CC_URL).read()
    except urllib2.HTTPError:
        logging.error("Unable to open CC_URL of %s" % CC_URL)
        return
    cc_tree = lxml.html.document_fromstring(cc_home)
    cc_tree.make_links_absolute(CC_URL)
    script_elements = cc_tree.xpath('//script')
    js_html = ''
    for script in script_elements:
        if script.text is None:
            js_html += lxml.html.tostring(script)
    cache.set('cc-scripts', js_html)
    header = cc_tree.xpath('//div[@id="header-wrapper"]')[0]
    cache.set('cc-header', lxml.html.tostring(header))
    footer = cc_tree.xpath('//footer[@id="footer"]')[0]
    # Bug with the harvested footer from the library website: strip its
    # scripts for IE.
    footer_scripts = footer.xpath('script')
    for script in footer_scripts:
        footer.remove(script)
    cache.set('cc-footer', lxml.html.tostring(footer))
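# A hedged sketch of the read side, assuming the same cache API used above
# (a Django-style cache whose get calls mirror the set calls):
def cached_chrome():
    return {
        'scripts': cache.get('cc-scripts'),
        'header': cache.get('cc-header'),
        'footer': cache.get('cc-footer'),
    }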
import re

from lxml.html import document_fromstring

manga_name = ''

def get_manga_name(url, get=None):
    global manga_name
    if len(manga_name):
        return manga_name.split('-', 1)[1]
    if re.search('/online/[^/]+', url):
        url = document_fromstring(get(url)).cssselect(
            '.postload a.a-series-title.manga-title')[0].get('href')
    name = re.search(r'/[^/]+/(\d+)-([^/]+)\.html', url)
    if not name:
        raise UrlParseError()
    groups = name.groups()
    manga_name = '{}-{}'.format(groups[0], groups[1])
    return groups[1]
def page_html(self):
    return html.document_fromstring(self.response.text)
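# A standalone sketch of the same pattern, assuming the requests library
# supplies the response object; the URL is a placeholder.
import requests
from lxml import html

response = requests.get('https://example.com')
doc = html.document_fromstring(response.text)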
def get_images(main_content=None, volume=None, get=None, post=None):
    content = get(volume)
    parser = document_fromstring(content)
    # The pager <select> lists every page of the volume.
    select = parser.cssselect('.mid .pager select[name="page"]')[0]
    images = []
    _img = __images_helper(parser, volume)
    # 'option + option' matches every <option> except the first,
    # i.e. the pages not already fetched above.
    items = select.cssselect('option + option')
    if _img:
        images.append(_img)
    for i in items:
        page = document_fromstring(get('{}{}/'.format(volume, i.get('value'))))
        _img = __images_helper(page, volume)
        if _img:
            images.append(_img)
    return images
def import_stage(self, harvest_object):
    log.debug('In DataPublicaHarvester import_stage')
    if not harvest_object:
        log.error('No harvest object received')
        return False
    if harvest_object.content is None:
        self._save_object_error('Empty content for object %s' % harvest_object.id,
                                harvest_object, 'Import')
        return False
    try:
        package_dict = {}
        extras_dict = {}
        package_dict['id'] = harvest_object.guid
        doc = html.document_fromstring(harvest_object.content)
        for field in doc.findall(".//div"):
            if 'field' not in field.get('class', ''):
                continue
            name = field.find("label").text.strip()
            if name == 'Title':
                package_dict['title'] = field.find("div").xpath("string()").strip()
            if name == 'Categories':
                extras_dict['categories'] = []
                for elem in field.findall("div[@class='input']"):
                    if not elem.text:
                        continue
                    extras_dict['categories'].append(elem.text.strip())
            if name == 'Software Licence':
                # TODO: what to do with these?
                a = field.find("div/a")
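# A small aside on the xpath("string()") call used above: it concatenates all
# text inside an element, which is why it is used for <div>s that may contain
# nested markup. A self-contained sketch:
from lxml import html

d = html.fromstring('<div>Data <b>Publica</b> title</div>')
print(d.xpath('string()'))   # 'Data Publica title'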
def __init__(self, server, book, chapter_name, html, use_cache=False,
             cache_dir=None):
    self.server = server
    self.book = book
    self.name = chapter_name
    self.use_cache = use_cache
    if cache_dir:
        self.image_cache = ImageCache(cache_dir)
    self.tree = lxml.html.document_fromstring(html)
@classmethod
def clean_html(cls, html, encoding=None):
    # The 'html' argument shadows the lxml.html module, so the module is
    # reached through the lxml package below.
    parser = lxml.html.HTMLParser(encoding=encoding)
    if isinstance(html, unicode) and encoding is not None:
        html = html.encode(encoding)
    html = lxml.html.document_fromstring(html, parser=parser)
    return _cleaner.clean_html(html)
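# The module-level _cleaner above is not shown in the snippet; it is assumed
# to be an lxml.html.clean.Cleaner instance, e.g. (a sketch, not the original
# configuration):
from lxml.html.clean import Cleaner
_cleaner = Cleaner(scripts=True, javascript=True, style=True)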
def get_volumes(content=None, url=None, get=None, post=None):
    items = document_fromstring(content).cssselect('.chapters_list a[href*="/reader#"]')
    return [i.get('href') for i in items]
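# Note: .cssselect() requires the separate cssselect package. An approximate
# XPath equivalent (a sketch; class matching here is by substring) avoids the
# extra dependency:
def get_volumes_xpath(content=None):
    doc = document_fromstring(content)
    items = doc.xpath('//*[contains(@class, "chapters_list")]'
                      '//a[contains(@href, "/reader#")]')
    return [i.get('href') for i in items]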