# Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test_no_start_url_spider():
    """A Spider subclass that declares no start_urls must raise ValueError on start().

    The original version only asserted inside the ``except`` block, so the test
    passed vacuously when no exception was raised at all.  Catch ValueError
    narrowly and fail explicitly if ``start()`` returns without raising.
    """
    class NoStartUrlSpider(Spider):
        pass

    try:
        NoStartUrlSpider.start()
    except ValueError:
        pass  # expected: missing start_urls is rejected
    else:
        raise AssertionError(
            "Spider.start() without start_urls should raise ValueError"
        )
def test_callback_error():
    """Smoke-test Spider.start() with a missing parse and a raising parse.

    Neither a spider without a ``parse`` override nor one whose callback
    raises should crash ``start()`` itself.
    """
    # No parse() defined at all — the framework must tolerate it.
    class NoParseSpider(Spider):
        start_urls = ["https://httpbin.org/get"]

    NoParseSpider.start()

    # parse() raising inside the callback must be contained by the framework.
    class CallbackError(Spider):
        start_urls = ["https://httpbin.org/get"]

        async def parse(self, response):
            raise ValueError("error")

    CallbackError.start()
def test_spider_hook_error():
    """An exception raised by the before_stop hook must not escape start()."""

    class SpiderDemo(Spider):
        start_urls = ["https://httpbin.org/get?p=0"]

        async def parse(self, response):
            pass

    # Hook deliberately raises; start() is expected to absorb it.
    async def before_stop_func(spider_ins):
        raise TypeError("error")

    event_loop = asyncio.new_event_loop()
    SpiderDemo.start(loop=event_loop, before_stop=before_stop_func)
def test_spider_hook():
async def after_start_func(spider_ins):
print("after_start_func")
spider_ins.result["after_start"] = True
assert isinstance(spider_ins.result, dict)
async def before_stop_func(spider_ins):
print("before_stop_func")
spider_ins.result["before_stop"] = True
class SpiderHook(Spider):
start_urls = ["https://httpbin.org/get?p=0", "https://httpbin.org/404"]
request_config = {"RETRIES": 1, "DELAY": 0, "TIMEOUT": 10}
result = {
"after_start": False,
"before_stop": False,
"process_succeed_response": False,
"process_failed_response": False,
"process_item": False,
}
async def parse(self, response):
item = await ItemDemo.get_item(html=HTML)
yield item
async def process_item(self, item):
async def clean_ranking_title(self, ranking_title):
    """Normalize a ranking title so it ends with a single '榜'.

    A list input is unwrapped to the first element's text (presumably a
    parsed HTML element — TODO confirm against the field definition);
    anything else is truncated at the first '榜' and re-suffixed.
    """
    if isinstance(ranking_title, list):
        return ranking_title[0].text
    # partition('榜')[0] == split('榜')[0]: text before the first '榜',
    # or the whole string when '榜' is absent.
    return str(ranking_title).partition('榜')[0] + '榜'
async def clean_more(self, more):
    """Turn a protocol-relative '//...' link into an absolute https URL."""
    scheme = "https:"
    return scheme + more
# Declarative ruia Item for one ranking entry's book names.
class NameItem(Item):
# Primary name, extracted from the <h4> text; empty string when absent.
top_name = TextField(css_select='h4', default='')
# Alternate name from the 'a.name' anchor text; empty string when absent.
other_name = TextField(css_select='a.name', default='')
class QidianRankingSpider(Spider):
start_urls = [f"https://www.qidian.com/rank/?chn={key}" for key in
[-1, 21, 1, 2, 22, 4, 15, 6, 5, 7, 8, 9, 10, 12]]
concurrency = 3
qidian_type = {
'-1': '全部类别',
'21': '玄幻',
'1': '奇幻',
'2': '武侠',
'22': '仙侠',
'4': '都市',
'15': '职场',
'6': '军事',
'5': '历史',
'7': '游戏',
'8': '体育',
else:
return cover.replace('http', 'https')
async def clean_novels_type(self, novels_type):
    """Map site-specific category labels onto canonical ones.

    Unknown labels pass through unchanged.  The original performed the
    dict lookup twice and printed the result — leftover debug output
    removed, lookup done once.
    """
    # Canonical-name overrides; extend as new site labels appear.
    types_dict = {
        '社会': '都市',
    }
    return types_dict.get(str(novels_type).strip(), novels_type)
async def clean_latest_chapter_time(self, latest_chapter_time):
    """Replace the relative day words '今天'/'昨日' with concrete dates.

    Dates are formatted as 'YYYY-MM-DD ' (trailing space preserved) from
    the local clock; text without those words passes through unchanged.
    """
    today = time.strftime("%Y-%m-%d ", time.localtime())
    yesterday = time.strftime(
        "%Y-%m-%d ", time.localtime(time.time() - 24 * 60 * 60)
    )
    normalized = latest_chapter_time.replace('今天', today)
    return normalized.replace('昨日', yesterday)
class HYNovelInfoSpider(Spider):
start_urls = []
request_config = {
'RETRIES': 3,
'TIMEOUT': 10
}
async def parse(self, res):
self.motor_db = MotorBase(loop=self.loop).get_db()
item = await HYNovelInfoItem.get_item(html=res.html)
item_data = {
'novel_name': item.novel_name,
'author': item.author,
'cover': item.cover,
'abstract': item.abstract,
'status': item.status,
"""
import aiofiles
from ruia import AttrField, TextField, Item, Spider
class HackerNewsItem(Item):
    """One Hacker News story row: its title text and link href."""

    # Each 'tr.athing' row yields one item instance.
    target_item = TextField(css_select="tr.athing")
    title = TextField(css_select="a.storylink")
    url = AttrField(css_select="a.storylink", attr="href")

    async def clean_title(self, value):
        # Strip surrounding whitespace from the scraped title text.
        return value.strip()
class HackerNewsSpider(Spider):
    """Crawl the first two Hacker News pages and append story titles to a file."""

    start_urls = [
        "https://news.ycombinator.com/news?p=1",
        "https://news.ycombinator.com/news?p=2",
    ]
    concurrency = 10

    async def parse(self, response):
        # Yield one HackerNewsItem per story row; process_item persists each.
        async for item in HackerNewsItem.get_items(html=response.html):
            yield item

    async def process_item(self, item: HackerNewsItem):
        # Append-only log of titles, one per line.
        async with aiofiles.open("./hacker_news.txt", "a") as out_file:
            await out_file.write(str(item.title) + "\n")
if __name__ == "__main__":
#!/usr/bin/env python
from ruia import Spider
# Retry hook wired in via request_config["RETRY_FUNC"]: before a failed
# request is re-issued, raise its per-request timeout from the deliberately
# aggressive 0.1s to 10s so the retry has a realistic chance to succeed.
async def retry_func(request):
request.request_config["TIMEOUT"] = 10
class RetryDemo(Spider):
    """Demo spider exercising ruia's retry hook and multiple_request fan-out."""

    start_urls = ["http://httpbin.org/get"]
    # TIMEOUT of 0.1s is intentionally tight so retries fire; RETRY_FUNC
    # then relaxes the timeout for subsequent attempts.
    request_config = {
        "RETRIES": 3,
        "DELAY": 0,
        "TIMEOUT": 0.1,
        "RETRY_FUNC": retry_func,
    }

    async def parse(self, response):
        # Fan out to two extra pages and hand each response to parse_item.
        pages = ["http://httpbin.org/get?p=1", "http://httpbin.org/get?p=2"]
        async for page_response in self.multiple_request(pages):
            yield self.parse_item(response=page_response)

    async def parse_item(self, response):
        json_data = await response.json()
async def clean_novel_author(self, novel_author):
    """Extract the author name; empty string when the field is missing.

    A non-empty list is unwrapped to its first element's ``.text``
    (presumably a parsed HTML element — TODO confirm); any falsy value
    (None, '', []) collapses to ''.
    """
    if not novel_author:
        return ''
    if isinstance(novel_author, list):
        return novel_author[0].text
    return novel_author
# def tal_novel_author_home_url(self, novel_author_home_url):
# if isinstance(novel_author_home_url, list):
# novel_author_home_url = novel_author_home_url[0].get('href').strip()
# return 'http:' + novel_author_home_url
class ZHNovelsSpider(Spider):
start_urls = ['http://book.zongheng.com/store/c0/c0/b9/u0/p1/v9/s9/t0/ALL.html']
request_config = {
'RETRIES': 8,
'DELAY': 0,
'TIMEOUT': 3
}
concurrency = 60
motor_db = MotorBase(loop=loop).get_db()
async def parse(self, res):
items_data = await ZHNovelsItem.get_items(html=res.html)
tasks = []
for item in items_data:
if item.novel_url:
res_dic = {
async def clean_novel_author(self, novel_author):
    """Return the author name, unwrapping a list to its first element's text."""
    if isinstance(novel_author, list):
        return novel_author[0].text
    return novel_author
async def clean_novel_author_home_url(self, novel_author_home_url):
    """Normalize the author's home link to an absolute https URL.

    A list input is unwrapped to the first element's stripped 'href'
    attribute before the scheme is prepended.
    """
    url = novel_author_home_url
    if isinstance(url, list):
        url = url[0].get('href').strip()
    return 'https:' + url
async def clean_novel_cover(self, novel_cover):
    """Prefix the protocol-relative cover URL with the https scheme."""
    return f"https:{novel_cover}"
class QidianNovelsSpider(Spider):
# start_urls = ['https://www.qidian.com/all?page=1']
request_config = {
'RETRIES': 15,
'DELAY': 0,
'TIMEOUT': 3
}
concurrency = 20
motor_db = MotorBase(loop=loop).get_db()
async def parse(self, res):
items_data = await QidianNovelsItem.get_items(html=res.html)
tasks = []
for item in items_data:
res_dic = {
'novel_url': item.novel_url,