Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test_html_field():
    """Check ``ruia.HtmlField`` extraction for a single match and for ``many=True``.

    Relies on a module-level ``html`` tree defined outside this function.
    """
    # Single-element extraction: the raw markup of the ``.title`` node.
    title_field = ruia.HtmlField(css_select=".title", default="Untitled")
    title_markup = title_field.extract(html_etree=html)
    assert title_markup == '<div href="/" class="title">Ruia Documentation</div>\n'

    # Multi-element extraction: the result is indexable; check the second entry.
    tag_field = ruia.HtmlField(css_select=".tag", default="No tag", many=True)
    tag_markups = tag_field.extract(html_etree=html)
    assert tag_markups[1] == '<li href="./fast.html" class="tag">fast</li>\n '
def test_html_field():
    """Check that ``HtmlField`` returns the raw markup of the selected element.

    Relies on a module-level ``html_etree`` tree defined outside this function.
    """
    anchor_field = HtmlField(css_select="div.brand a")
    paragraph_field = HtmlField(css_select="div.brand p")

    # ASCII content inside an anchor.
    anchor_markup = anchor_field.extract(html_etree=html_etree)
    assert anchor_markup == '<a href="https://github.com">Github</a>'

    # Non-ASCII content must come back unescaped.
    paragraph_markup = paragraph_field.extract(html_etree=html_etree)
    assert paragraph_markup == "<p>你好</p>\n"
def test_html_field_with_many():
    """Check ``HtmlField(many=True)`` returns one markup string per matched node.

    Relies on a module-level ``html_etree`` tree defined outside this function.
    """
    field = HtmlField(css_select="a.test_link", many=True)
    values = field.extract(html_etree=html_etree)
    assert len(values) == 5
    assert (
        values[0]
        == '<a href="https://github.com/howie6879/" class="test_link">hello1 github.</a>\n'
    )
    # The last match also carries the text that follows the element, so the
    # expected value is two adjacent literals joined by implicit concatenation.
    assert (
        values[4]
        == '<a href="https://github.com/howie6879/" class="test_link">hello5 github.</a>\n'
        " Some text outside.\n"
    )
def test_html_field():
    """Verify ``ruia.HtmlField`` output for one element and for a list of elements.

    Uses the ``html`` tree that is defined at module level outside this function.
    """
    expected_title = '<div href="/" class="title">Ruia Documentation</div>\n'
    expected_second_tag = '<li href="./fast.html" class="tag">fast</li>\n '

    # Default (single) mode yields one markup string.
    title_field = ruia.HtmlField(css_select=".title", default="Untitled")
    assert title_field.extract(html_etree=html) == expected_title

    # ``many=True`` yields an indexable collection of markup strings.
    tags_field = ruia.HtmlField(css_select=".tag", default="No tag", many=True)
    all_tags = tags_field.extract(html_etree=html)
    assert all_tags[1] == expected_second_tag
def test_html_field():
    """Verify ``HtmlField`` extraction for an English and a Chinese element.

    Uses the ``html_etree`` tree that is defined at module level outside this
    function.
    """
    english_field = HtmlField(css_select="div.brand a")
    chinese_field = HtmlField(css_select="div.brand p")

    english_markup = english_field.extract(html_etree=html_etree)
    assert english_markup == '<a href="https://github.com">Github</a>'

    chinese_markup = chinese_field.extract(html_etree=html_etree)
    assert chinese_markup == "<p>你好</p>\n"
#!/usr/bin/env python
import asyncio
import time
from ruia import Spider, Item, AttrField, HtmlField, TextField
from ruia_ua import middleware
from owllook.database.mongodb import MotorBaseOld
class RankingItem(Item):
    """One ranking block on the page: its title, "more" link and book entries."""

    target_item = TextField(css_select='.rank-list')
    ranking_title = TextField(css_select='h3.wrap-title')
    more = AttrField(css_select='h3>a.more', attr='href')
    book_list = HtmlField(css_select='div.book-list>ul>li', many=True)

    async def clean_ranking_title(self, ranking_title):
        """Normalise the extracted ranking title.

        A list input yields the ``text`` attribute of its first element. Any
        other input is stringified, cut at the first '榜' and given a trailing
        '榜' (which is appended even when the character was not present).
        """
        if not isinstance(ranking_title, list):
            leading_part = str(ranking_title).split('榜')[0]
            return leading_part + '榜'
        first_entry = ranking_title[0]
        return first_entry.text

    async def clean_more(self, more):
        """Prefix the extracted href with ``https:``.

        NOTE(review): presumably the href is protocol-relative ("//host/...");
        confirm against the scraped page.
        """
        scheme_prefix = "https:"
        return scheme_prefix + more
class NameItem(Item):
    """Book-name fields; both fall back to an empty string via ``default=''``."""

    # Name taken from an ``h4`` element.
    top_name = TextField(
        css_select='h4',
        default='',
    )
    # Name taken from an ``a.name`` element.
    other_name = TextField(
        css_select='a.name',
        default='',
    )
"""
Created by howie.hu at 29/11/2017.
"""
import time
from ruia import Spider, Item, AttrField, HtmlField, TextField
from ruia_ua import middleware
from owllook.database.mongodb import MotorBaseOld
class RankingItem(Item):
    """One ranking block: container text, title, "more" link and book entries."""

    # Text of the ranking list container.
    target_item = TextField(
        css_select='div.rank_i_p_list',
    )
    # Text of the ranking heading.
    ranking_title = TextField(
        css_select='div.rank_i_p_tit',
    )
    # ``href`` of the "more" anchor.
    more = AttrField(
        css_select='div.rank_i_more a',
        attr='href',
    )
    # Raw markup of every book row inside the list (``many=True``).
    book_list = HtmlField(
        css_select='div.rank_i_p_list>div.rank_i_li',
        many=True,
    )
class NameItem(Item):
    """Book-name fields; both fall back to an empty string via ``default=''``."""

    # Name from the ``a.rank_i_l_a_book`` anchor inside ``div.rank_i_bname``.
    top_name = TextField(
        css_select='div.rank_i_bname a.rank_i_l_a_book',
        default='',
    )
    # Name from any anchor inside ``div.rank_i_bname``.
    other_name = TextField(
        css_select='div.rank_i_bname a',
        default='',
    )
class ZHRankingSpider(Spider):
start_urls = ['http://book.zongheng.com/rank.html']
concurrency = 3
async def parse(self, res):
result = []
res_dic = {}