parsers = {"html":lambda x:html5lib.parse(x, treebuilder="etree"),
"xhtml":lambda x:ElementTree.parse(x, XMLParser.XMLParser()),
# Tail of get_sentence(): slice the element text by the word stand-off limits,
# either starting a new token or extending the previous one, then stop searching.
            reconstructedsentence.append(tail[int(wordstandofflimits[0]):int(wordstandofflimits[1])+1])
        else:
            reconstructedsentence[-1] = reconstructedsentence[-1] + tail[int(wordstandofflimits[0]):int(wordstandofflimits[1])+1]
        break
    return " ".join(reconstructedsentence)
# Argument input: path of original Bitextor formatted crawl file
import sys
import base64
import html5lib

document_standoff = dict()
with open(sys.argv[1], 'r') as reader:
    for line in reader:
        fields = line.split('\t')
        fields = list(map(str.strip, fields))  # Strip all elements
        # We use lxml treebuilder because of getelementpath function and iteration through elements
        # Store url:html5lib_tree for easy path search
        document_standoff[fields[1]] = html5lib.parse(base64.b64decode(fields[0]), treebuilder="lxml", namespaceHTMLElements=False)

# Input: Bitextor DOCALG file (deferred):
#   url1 url2 deferred_clean_text1_in_base64 deferred_clean_text2_in_base64
# Output: Bitextor DOCALG file reconstructed:
#   url1 url2 clean_text1_in_base64 clean_text2_in_base64
for line in sys.stdin:
    fields = line.split('\t')
    newfields = [fields[0], fields[1]]
    # SL and TL annotations with URLs from input DOCALG file format:
    # https://github.com/bitextor/bitextor/wiki/Intermediate-formats-used-in-Bitextor#docalg
    for annotation, url in {fields[2]: fields[0], fields[3]: fields[1]}.items():
        if annotation != "":
            newfields.append(get_sentence(annotation, document_standoff[url]))
        else:
            newfields.append("")
    print("\t".join(newfields))
        msg = 'Amazon timed out. Try again later.'
        self.log.error(msg)
    else:
        msg = 'Failed to make details query: %r' % self.url
        self.log.exception(msg)
    return

oraw = raw
raw = xml_to_unicode(raw, strip_encoding_pats=True,
                     resolve_entities=True)[0]
if '<title>404 - ' in raw:
    self.log.error('URL malformed: %r' % self.url)
    return

try:
    root = html5lib.parse(clean_ascii_chars(raw), treebuilder='lxml',
                          namespaceHTMLElements=False)
except:
    msg = 'Failed to parse amazon details page: %r' % self.url
    self.log.exception(msg)
    return

errmsg = root.xpath('//*[@id="errorMessage"]')
if errmsg:
    msg = 'Failed to parse amazon details page: %r' % self.url
    msg += self.tostring(errmsg, method='text', encoding=unicode).strip()
    self.log.error(msg)
    return

self.parse_details(oraw, root)
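# A self-contained sketch of the same parsing pattern used above: html5lib with the
# lxml treebuilder returns an lxml tree, so XPath queries work on the result. The
# markup and the element id below are examples only.
import html5lib

raw = '<html><body><div id="errorMessage">Something went wrong</div></body></html>'
root = html5lib.parse(raw, treebuilder='lxml', namespaceHTMLElements=False)
errmsg = root.xpath('//*[@id="errorMessage"]')
if errmsg:
    print(errmsg[0].text.strip())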
session = requests.session()
session.verify = False

for package in packages:
    print("")
    print("Download candidates for %s" % package)
    print("========================" + ("=" * len(package)))

    # Grab the page from PyPI
    url = "https://pypi.python.org/simple/%s/" % package
    resp = session.get(url)
    if resp.status_code == 404:
        continue
    resp.raise_for_status()

    html = html5lib.parse(resp.content, namespaceHTMLElements=False)

    spider = set()
    installable_ = set()

    for link in itertools.chain(
            html.findall(".//a[@rel='download']"),
            html.findall(".//a[@rel='homepage']")):
        if "href" in link.attrib:
            try:
                absolute_link = urlparse.urljoin(url, link.attrib["href"])
            except Exception:
                continue

            if not installable(package, absolute_link):
                parsed = urlparse.urlparse(absolute_link)
                if parsed.scheme.lower() in ["http", "https"]:
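# installable() is called above but not shown. A rough, hypothetical stand-in:
# treat a link as installable when its path looks like a release archive for the
# package. The real check in the original tool may differ.
import posixpath
try:
    import urlparse                  # Python 2, as used in the snippet above
except ImportError:
    import urllib.parse as urlparse  # Python 3 fallback

ARCHIVE_SUFFIXES = (".tar.gz", ".tar.bz2", ".tgz", ".zip", ".whl")

def installable(package, link):
    filename = posixpath.basename(urlparse.urlparse(link).path).lower()
    return (filename.startswith(package.lower())
            and filename.endswith(ARCHIVE_SUFFIXES))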
def _get_html(self):
    url_opener = URLOpener()
    response = url_opener.open(self.url)
    if lxml:
        html = lxml.html.parse(response).getroot()
    else:
        html = html5lib.parse(response, namespaceHTMLElements=False)
    return html
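# The `if lxml:` test above suggests lxml is an optional dependency, with html5lib
# as the pure-Python fallback. A common way to arrange that (an assumption, not
# shown in the snippet) is a guarded import at module level:
try:
    import lxml.html   # binds the truthy `lxml` package object on success
except ImportError:
    lxml = None        # _get_html() then falls back to html5lib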
def fetch(package):
    print(" Download candidates for {}".format(package["name"]))

    # GET the page from the mirror
    url = url_template.format(package["name"])
    resp = session.get(url)
    if resp.status_code != 200:
        print(
            " Cannot fetch candidates: error {}".format(resp.status_code)
        )

    # Parse HTML content
    html = html5lib.parse(resp.content, namespaceHTMLElements=False)

    # Iterate all the provided downloads
    for link in html.findall(".//a"):
        package["candidates"].append((link.text, link.attrib["href"]))
def extract_table_row(html):
    doc = html5lib.parse(html)
    return find_element_by_tag('tr', doc)
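# find_element_by_tag() is not shown above. A hypothetical stand-in: walk the tree and
# return the first element whose local tag name matches. html5lib.parse() without
# namespaceHTMLElements=False produces namespaced tags such as
# '{http://www.w3.org/1999/xhtml}tr', so only the local part of the tag is compared.
def find_element_by_tag(tag, tree):
    for element in tree.iter():
        # Comment nodes have a non-string .tag, so skip them.
        if isinstance(element.tag, str) and element.tag.split('}')[-1] == tag:
            return element
    return None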
def find_iter(skeleton, document):
    """
    Return an iterator that yields elements from the document that
    match the given skeleton.

    See `find_all` for details.
    """
    if is_string(document):
        document = html5lib.parse(document)
    if is_string(skeleton):
        fragment = html5lib.parseFragment(skeleton)
        if len(fragment) != 1:
            raise ValueError("Skeleton must have exactly one root element.")
        skeleton = fragment[0]
    for element in document.iter():
        if node_matches_bone(element, skeleton):
            yield element
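# find_iter() relies on is_string() and node_matches_bone(), which are not shown.
# Hypothetical, simplified stand-ins (tag-name matching only) so the call below runs;
# the matching logic in the original library is presumably richer.
import html5lib

def is_string(value):
    return isinstance(value, str)

def node_matches_bone(element, bone):
    return element.tag == bone.tag

page = "<ul><li>one</li><li>two</li></ul>"
for item in find_iter("<li></li>", page):
    print(item.text)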
def bench_html5lib(html_file):
    html_file.seek(0)
    html5lib.parse(html_file)
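# A sketch of driving the benchmark above: time repeated parses of a single file.
# The file name is an example.
import time

with open("page.html", "rb") as html_file:
    for _ in range(3):
        start = time.perf_counter()
        bench_html5lib(html_file)
        print("html5lib.parse: %.3f s" % (time.perf_counter() - start))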