from lxml import etree
import six

def get_html(self, response):
    # Parse the response body into an lxml HTML tree.
    data = response.get_data()
    parser = etree.HTMLParser()
    return etree.parse(six.StringIO(str(data)), parser)
from io import StringIO
from lxml import etree

def get_html(self, response):
    data = response.get_data()  # may be bytes; decode if needed
    parser = etree.HTMLParser()
    return etree.parse(StringIO(data), parser)
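# A minimal, self-contained sketch of the same pattern the two helpers above
# use, assuming the response body is already available as a string:
from io import StringIO
from lxml import etree

body = "<html><head><title>demo</title></head><body></body></html>"
tree = etree.parse(StringIO(body), etree.HTMLParser())
print(tree.xpath('//title/text()'))  # ['demo']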
def download_wechat_keyword_topics(self, word, process_topic):
    """On the article list page for a keyword, open each article in turn and crawl it."""
    browser = self.browser
    js = """ return document.documentElement.innerHTML; """
    body = browser.execute_script(js)
    htmlparser = etree.HTMLParser()
    tree = etree.parse(StringIO(body), htmlparser)
    # Strip the keyword-highlight markers from each result title.
    elems = [stringify_children(item).replace('red_beg', '').replace('red_end', '')
             for item in tree.xpath("//div[@class='txt-box']/h3/a")]
    hrefs = tree.xpath("//div[@class='txt-box']/h3/a/@href")
    #avatars = tree.xpath("//div[@class='img-box']/a/img/@src")
    #elems_abstracts = tree.xpath("//div[@class='txt-box']/p")
    #abstracts = [item.text.strip() if item.text else '' for item in elems_abstracts]
    avatars = [''] * len(elems)
    abstracts = [''] * len(elems)
    links = []
    for idx, item in enumerate(elems):
        title = item
        print(title)
        if not title:
            continue
        uniqueid = get_uniqueid('%s:%s' % (word, title))
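# The snippet above relies on a stringify_children() helper that is not shown.
# A plausible sketch (an assumption, not the original implementation): join a
# node's leading text with the serialized markup of its child elements.
from lxml import etree

def stringify_children(node):
    parts = [node.text or ''] + [
        etree.tostring(child, encoding='unicode') for child in node
    ]
    return ''.join(parts).strip()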
import StringIO  # Python 2; use io.StringIO on Python 3
from lxml import etree

def parseHtml(html):
    parser = etree.HTMLParser(encoding='utf8')
    tree = etree.parse(StringIO.StringIO(html), parser)
    return tree
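# Quick usage check for parseHtml(); the input here is a plain byte string,
# which matches the encoding='utf8' parser above:
tree = parseHtml('<html><body><p>hello</p></body></html>')
print(tree.xpath('//p/text()'))  # ['hello']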
query = self.create_query(title=title, authors=authors)
if not query:
    self.log('Insufficient metadata to construct query')
    return
br = self.browser
try:
    self.log('download page search %s' % query)
    raw = br.open(query, timeout=timeout).read().strip()
except Exception as e:
    self.log.exception('Failed to make identify query: %r' % query)
    return as_unicode(e)
try:
    parser = etree.HTMLParser()
    clean = clean_ascii_chars(raw)
    feed = fromstring(clean, parser=parser)
    # if len(parser.error_log) > 0:  # some errors while parsing
    #     self.log('some errors occurred while parsing the page:')
    #     self.log(parser.error_log)
    more_pages = pages_count(feed)  # more pages with search results
    que = Queue()
    if ident is not None:
        que.put([ident, title, authors])
    if len(more_pages) > 0:
        # last digit of the first number found in the pagination text
        page_max = int(re.search(r"\d+", more_pages[0]).group()[-1])
    else:
        page_max = 1
    formUrl = feed.xpath('//form[@id="form"]/@action')
    self.log('formUrl %s' % formUrl[0])
    url = self.BASE_URL + formUrl[0]
    parameters = {
        "sendform": "1",
        "login_name": self.prefs['login'],
        "login_password": self.prefs['password'],
    }
    data = urllib.urlencode(parameters)
    self.log(url)
    self.log(data)
    clean = clean_ascii_chars(br.open(url, data).read().strip())
    parser = etree.HTMLParser(recover=True)
    feed = fromstring(clean, parser=parser)
    self.log(clean)
    # Login succeeded if the login form is no longer present.
    return len(feed.xpath('//input[@id="login_name"]/@name')) == 0
except Exception as e:
    self.log.exception(e)
    return False
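# The snippet above calls a pages_count() helper that is not shown. A rough
# sketch of what it likely does (an assumption: the XPath depends entirely on
# the target site's pagination markup):
def pages_count(feed):
    # Return the text of pagination links, e.g. ['1', '2', '3'].
    return feed.xpath('//div[@class="pages"]//a/text()')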
def insert_todos(plan_fn, todos):
    info("insert_todos")
    html_parser = etree.HTMLParser(
        remove_comments=True, remove_blank_text=True
    )
    doc = etree.parse(open(plan_fn, "rb"), html_parser)
    # Swap the placeholder div for the freshly built todos element.
    div = doc.xpath('//div[@id="Ongoing-todos"]')[0]
    parent = div.getparent()
    parent.replace(div, todos)
    doc.write(plan_fn)
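# A usage sketch for insert_todos(), assuming plan.html contains a
# <div id="Ongoing-todos"> placeholder; the replacement element is built here
# with the same id so later runs can still locate it:
from lxml import etree

todos = etree.fromstring(
    '<div id="Ongoing-todos"><ul><li>write report</li></ul></div>',
    etree.HTMLParser(),
).find('.//div')
insert_todos('plan.html', todos)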
try:
    req = urllib2.Request(wcurl, None, txheaders)
    fh = urllib2.urlopen(req)
    fh.close()
    req = urllib2.Request('http://webcitation.org/topframe.php')
    fh = urllib2.urlopen(req)
    data = fh.read()
    fh.close()
except Exception as e:
    raise HandlerError('Cannot request page', 404)
changes = []
try:
    parser = etree.HTMLParser()
    dom = etree.parse(StringIO.StringIO(data), parser)
except Exception:
    raise HandlerError('Cannot parse HTML')
opts = dom.xpath('//select[@name="id"]/option')
for o in opts:
    fid = o.attrib['value']
    date = o.text
    if date.find('(failed)') > -1:
        continue
    changes.append(('http://webcitation.org/query?id=' + fid, date))
return changes
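# The returned list pairs a WebCite query URL with the snapshot date, so a
# caller might consume it like this:
for url, date in changes:
    print('%s  %s' % (date, url))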
def sanitise_html(dirty_html, opener=None, device=None):
    html = etree.fromstring("<div>%s</div>" % dirty_html, parser=etree.HTMLParser())
    html = transform(html, 'external_media/html_sanitiser.xslt')
    #if True or device:
    #    for element in html.findall(".//img[@externalmedia]"):
    #        print element
    return etree.tostring(html, method='html')[5:-6]  # serialize and strip the wrapping div tag
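# sanitise_html() depends on a transform() helper that is not shown. A minimal
# sketch using lxml's XSLT support (an assumption about the original helper):
from lxml import etree

def transform(doc, xslt_path):
    stylesheet = etree.XSLT(etree.parse(xslt_path))
    return stylesheet(doc)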
def list_courses(self):
    '''
    List courses available in the Studio site
    '''
    self.ensure_studio_site()
    url = "%s/home/" % self.BASE
    ret = self.ses.get(url)
    parser = etree.HTMLParser()
    xml = etree.parse(StringIO(ret.content), parser).getroot()
    courses = []
    course_ids = []
    for course in xml.findall('.//li[@class="course-item"]'):
        cid = course.get("data-course-key")
        if self.verbose:
            print(cid)  # etree.tostring(course)
        courses.append(course)
        course_ids.append(cid)
    return {'xml': courses,
            'course_ids': course_ids,
            }
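# A usage sketch, assuming a hypothetical client object that provides
# self.ses (a requests session logged in to Studio), self.BASE, and
# self.verbose:
result = client.list_courses()
print(result['course_ids'])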