Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test_css_select():
    """Check that a ``TextField`` built from a CSS selector extracts "ruia"."""
    # NOTE: ``html_etree`` is not defined inside this function; it must be
    # supplied by the enclosing test module.
    field = TextField(css_select="head title")
    value = field.extract(html_etree=html_etree)
    assert value == "ruia"
from ruia import Spider, Item, AttrField, HtmlField, TextField
from ruia_ua import middleware
from owllook.database.mongodb import MotorBaseOld
class RankingItem(Item):
    """Field declarations for one ranking panel, located via CSS selectors."""

    # Selector for the ranking panel itself.
    target_item = TextField(css_select='div.rank_i_p_list')
    # Text of the panel's title element.
    ranking_title = TextField(css_select='div.rank_i_p_tit')
    # ``href`` of the panel's "more" link.
    more = AttrField(css_select='div.rank_i_more a', attr='href')
    # Every book entry in the panel (``many=True``), extracted with HtmlField.
    book_list = HtmlField(css_select='div.rank_i_p_list>div.rank_i_li', many=True)
class NameItem(Item):
    """Field declarations for the book name inside one ranking entry."""

    # Name taken from the ``a.rank_i_l_a_book`` link; defaults to '' when absent.
    top_name = TextField(css_select='div.rank_i_bname a.rank_i_l_a_book', default='')
    # Name taken from any link under ``div.rank_i_bname``; defaults to ''.
    other_name = TextField(css_select='div.rank_i_bname a', default='')
class ZHRankingSpider(Spider):
start_urls = ['http://book.zongheng.com/rank.html']
concurrency = 3
async def parse(self, res):
result = []
res_dic = {}
async for item in RankingItem.get_items(html=res.html):
each_book_list = []
# 只取排名前十的书籍数据
for index, value in enumerate(item.book_list[:10]):
item_data = await NameItem.get_item(html=value)
#!/usr/bin/env python
from ruia import AttrField, TextField, Item
class HackerNewsItem(Item):
    """
    Extraction rules for the target fields.
    """

    target_item = TextField(css_select="tr.athing")
    title = TextField(css_select="a.storylink")
    url = AttrField(css_select="a.storylink", attr="href")

    async def clean_title(self, value):
        """
        Clean the extracted title.

        :param value: raw extracted value
        :return: the value converted to ``str`` with surrounding whitespace removed
        """
        return str(value).strip()
from ruia import TextField, Item, Spider
class HackerNewsItem(Item):
    """Field declarations for one Hacker News story row."""

    # Selector for a story row.
    target_item = TextField(css_select='tr.athing')
    # Text of the story link.
    title = TextField(css_select='a.storylink')
class HackerNewsSpider(Spider):
    """Spider that yields every ``HackerNewsItem`` found on the start page."""

    start_urls = ['https://news.ycombinator.com/news?p=1']

    async def parse(self, response):
        """Yield each item extracted from the response's HTML.

        :param response: response object; only its ``html`` attribute is read here
        """
        async for item in HackerNewsItem.get_items(html=response.html):
            yield item
# Start the spider only when this file is executed as a script.
if __name__ == '__main__':
    HackerNewsSpider.start()
#!/usr/bin/env python
"""
Target: https://news.ycombinator.com/
"""
import asyncio
from ruia import AttrField, TextField, Item
class HackerNewsItem(Item):
    """Field declarations for one Hacker News story row."""

    # Selector for a story row.
    target_item = TextField(css_select="tr.athing")
    # Text and ``href`` of the story link.
    title = TextField(css_select="a.storylink")
    url = AttrField(css_select="a.storylink", attr="href")

    async def clean_title(self, value):
        """Return ``value`` with surrounding whitespace removed.

        :param value: extracted title; must provide a ``strip()`` method
        """
        return value.strip()
async def single_page_demo(url="https://news.ycombinator.com/"):
    """Print every ``HackerNewsItem`` extracted from ``url``.

    :param url: page to extract from; defaults to the Hacker News front page
    """
    async for item in HackerNewsItem.get_items(url=url):
        print(item)
async def multiple_page_demo():
pages = [
single_page_demo(f"https://news.ycombinator.com/news?p={page}")
for page in range(1, 3)
Created by howie.hu at 11/03/2018.
获取起点荣誉数据,如:https://book.qidian.com/honor/1009704712
荣誉类型:
- 推荐票
- 收藏
- 点击
"""
from ruia import Spider, Item, TextField
from ruia_ua import middleware
class QidianHonorItem(Item):
    """Field declarations for one honor entry, located via CSS selectors."""

    # Selector for an honor entry.
    target_item = TextField(css_select='li.cf')
    # Text of the ``span.decs`` element.
    honor_text = TextField(css_select='span.decs')
    # Text of the ``span.time`` element.
    honor_time = TextField(css_select='span.time')
class QidianHonorSpider(Spider):
start_urls = ['https://book.qidian.com/honor/1009531496']
request_config = {
'RETRIES': 3,
'DELAY': 0,
'TIMEOUT': 10
}
async def parse(self, res):
items_data = await QidianHonorItem.get_items(html=res.html)
click_list, col_list, rec_list, other_list = [], [], [], []
for item in items_data:
data = {
from owllook.database.mongodb import MotorBase
from owllook.spiders.middlewares import owl_middleware
# Use uvloop's event loop policy when uvloop is installed; otherwise keep the
# default asyncio policy.
try:
    import uvloop

    asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
except ImportError:
    pass

# asyncio.get_event_loop() raises RuntimeError when there is no current event
# loop (e.g. outside the main thread, or on newer Python versions), so create
# one explicitly in that case instead of failing at import time.
try:
    loop = asyncio.get_event_loop()
except RuntimeError:
    loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
class ZHNovelsItem(Item):
target_item = TextField(css_select='div.store_collist div.bookbox')
novel_url = AttrField(css_select='div.bookinfo div.bookname a', attr='href')
novel_name = TextField(css_select='div.bookinfo div.bookname a')
novel_author = TextField(css_select='div.bookilnk a:nth-child(1)')
novel_author_home_url = AttrField(css_select='div.bookilnk a:nth-child(1)', attr='href')
novel_type = TextField(css_select='div.bookilnk a:nth-child(2)')
novel_cover = AttrField(css_select='div.bookimg img', attr='src')
novel_abstract = TextField(css_select='div.bookintro')
novel_latest_chapter = TextField(css_select='div.bookupdate a')
# def tal_novel_url(self, novel_url):
# return 'http:' + novel_url
async def clean_novel_author(self, novel_author):
if novel_author:
if isinstance(novel_author, list):
novel_author = novel_author[0].text
import asyncio
import aiofiles
from ruia import Item, TextField, AttrField, Spider
class HackerNewsItem(Item):
    """Field declarations for one Hacker News story row."""

    # Selector for a story row.
    target_item = TextField(css_select='tr.athing')
    # Text and ``href`` of the story link.
    title = TextField(css_select='a.storylink')
    url = AttrField(css_select='a.storylink', attr='href')
class HackerNewsSpider(Spider):
    """Spider that appends Hacker News story titles to ``./hacker_news.txt``."""

    concurrency = 2
    start_urls = [f'https://news.ycombinator.com/news?p={index}' for index in range(10)]

    async def parse(self, res):
        """Append the title of every item in one response to the output file.

        :param res: response object; only its ``html`` attribute is read here
        """
        items = await HackerNewsItem.get_items(html=res.html)
        lines = [item.title + '\n' for item in items]
        if not lines:
            # Nothing extracted: do not touch the output file.
            return
        # Open the file once per response instead of once per item.
        async with aiofiles.open('./hacker_news.txt', mode='a', encoding='utf-8') as f:
            await f.writelines(lines)
async def test_item():
asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
except ImportError:
pass
loop = asyncio.get_event_loop()
asyncio.set_event_loop(loop)
class ZHNovelsItem(Item):
    """Field declarations for one novel entry, located via CSS selectors."""

    # Selector for a novel entry.
    target_item = TextField(css_select='div.store_collist div.bookbox')
    # Link and text of the novel name anchor.
    novel_url = AttrField(css_select='div.bookinfo div.bookname a', attr='href')
    novel_name = TextField(css_select='div.bookinfo div.bookname a')
    # First anchor under ``div.bookilnk``: author text and author page link.
    novel_author = TextField(css_select='div.bookilnk a:nth-child(1)')
    novel_author_home_url = AttrField(css_select='div.bookilnk a:nth-child(1)', attr='href')
    # Second anchor under ``div.bookilnk``.
    novel_type = TextField(css_select='div.bookilnk a:nth-child(2)')
    novel_cover = AttrField(css_select='div.bookimg img', attr='src')
    novel_abstract = TextField(css_select='div.bookintro')
    novel_latest_chapter = TextField(css_select='div.bookupdate a')

    # def tal_novel_url(self, novel_url):
    #     return 'http:' + novel_url

    async def clean_novel_author(self, novel_author):
        """Normalize the extracted author value.

        :param novel_author: extracted value; may be a list of elements
            exposing ``.text``
        :return: the first element's ``.text`` for a list input, the value
            itself for any other truthy input, or ``''`` when it is falsy
        """
        if not novel_author:
            return ''
        if isinstance(novel_author, list):
            novel_author = novel_author[0].text
        return novel_author
# def tal_novel_author_home_url(self, novel_author_home_url):
#!/usr/bin/env python
from ruia import AttrField, Item, Spider, TextField
class DoubanItem(Item):
    """Field declarations for one Douban movie entry, located via CSS selectors."""

    # Selector for a movie entry.
    target_item = TextField(css_select="div.item")
    title = TextField(css_select="span.title")
    # ``src`` of the cover image.
    cover = AttrField(css_select="div.pic>a>img", attr="src")
    # Text of ``span.inq``; defaults to "" when absent.
    abstract = TextField(css_select="span.inq", default="")

    async def clean_title(self, title):
        """Return ``title`` as a single string.

        :param title: either a ``str`` or an iterable of elements exposing ``.text``
        :return: the string unchanged; otherwise each element's text stripped,
            with non-breaking spaces removed, all concatenated
        """
        if isinstance(title, str):
            return title
        return "".join(i.text.strip().replace("\xa0", "") for i in title)
class DoubanSpider(Spider):
start_urls = ["https://movie.douban.com/top250"]
request_config = {"RETRIES": 3, "DELAY": 0, "TIMEOUT": 20}
concurrency = 10
# proxy config
# kwargs = {"proxy": "http://0.0.0.0:1087"}
kwargs = {}