Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
class DoubanItem(Item):
    """Item exposing a page's ``<head>`` title plus one constant attribute."""

    # Text of the element matched by the CSS selector "head title".
    title = TextField(css_select="head title")

    # Plain class-level string, not a field.
    constant_attr = "hello ruia"

    async def clean_title(self, title):
        """Return ``title`` with the literal prefix ``"Title: "`` prepended.

        Presumably picked up by ruia as the cleaning hook for the ``title``
        field (naming convention ``clean_<field>``) -- confirm against ruia.
        """
        prefix = "Title: "
        return prefix + title
class DoubanCleanMethodErrorItem(Item):
    """Item whose ``clean_title`` hook is a plain, non-async method.

    NOTE(review): the class name suggests the synchronous hook is deliberate
    (to exercise an error path) -- confirm before converting it to
    ``async def``.
    """

    # Text of the element matched by the CSS selector "head title".
    title = TextField(css_select="head title")

    def clean_title(self, title):
        """Synchronously return ``title`` prefixed with ``"Title: "``."""
        prefix = "Title: "
        return prefix + title
class DoubanIgnoreItem(Item):
    """Item whose ``clean_title`` hook always raises ``IgnoreThisItem``."""

    # Text of the element matched by the CSS selector "head title".
    title = TextField(css_select="head title")

    async def clean_title(self, title):
        """Unconditionally raise ``IgnoreThisItem``; ``title`` is not used."""
        raise IgnoreThisItem
class HackerNewsItem(Item):
    """Field declarations for one Hacker News story row."""

    # Selector "tr.athing"; presumably the per-item container -- confirm
    # against ruia's ``target_item`` semantics.
    target_item = TextField(css_select="tr.athing")

    # Text of the "a.storylink" anchor.
    title = TextField(css_select="a.storylink")

    # ``href`` attribute of the same "a.storylink" anchor.
    url = AttrField(attr="href", css_select="a.storylink")
async def parse_item(html):
    """Drain ``DoubanItems.get_items(html=html)`` into a local list.

    NOTE(review): in the visible code the collected list is never returned,
    so the coroutine resolves to ``None``. The excerpt may be cut short --
    confirm against the full file.
    """
    items = [item async for item in DoubanItems.get_items(html=html)]
#!/usr/bin/env python
import asyncio
import os
from ruia import AttrField, Item, TextField
from ruia.exceptions import IgnoreThisItem, InvalidFuncType
# Absolute path of "data/for_item_testing.html", resolved relative to the
# directory containing this file (symlinks resolved via realpath).
html_path = os.path.join(
    os.path.dirname(os.path.realpath(__file__)),
    "data",
    "for_item_testing.html",
)

# Read the whole fixture once at import time; HTML holds its text content.
with open(html_path, mode="r", encoding="utf-8") as file:
    HTML = file.read()
class DoubanItems(Item):
    """Field declarations for entries matched by ``div.item``."""

    # Selector "div.item"; presumably the per-item container -- confirm
    # against ruia's ``target_item`` semantics.
    target_item = TextField(css_select="div.item")

    # Text of "span.title".
    title = TextField(css_select="span.title")

    # ``src`` attribute of the image under "div.pic>a>img".
    cover = AttrField(attr="src", css_select="div.pic>a>img")

    # Text of "span.inq".
    abstract = TextField(css_select="span.inq")

    async def clean_title(self, title):
        """Normalise ``title`` to a single string.

        A ``str`` is returned unchanged. Anything else is iterated: each
        element's ``.text`` is stripped, non-breaking spaces (``\\xa0``) are
        removed, and the pieces are concatenated without a separator.
        """
        if not isinstance(title, str):
            pieces = [node.text.strip().replace("\xa0", "") for node in title]
            return "".join(pieces)
        return title
class DoubanItem(Item):
    """Item exposing a page's ``<head>`` title plus one constant attribute."""

    # Text of the element matched by the CSS selector "head title".
    title = TextField(css_select="head title")

    # Plain class-level string, not a field.
    constant_attr = "hello ruia"
async def retry_func(request):
    """Set ``request.request_config["TIMEOUT"]`` to 10, in place.

    The mapping is mutated rather than replaced, and nothing is returned.
    """
    config = request.request_config
    config["TIMEOUT"] = 10
@middleware.request
async def print_on_request(spider_ins, request):
    """Replace the request's headers with a single fixed User-Agent.

    ``spider_ins`` is accepted but not used.
    """
    fixed_headers = {"User-Agent": "ruia ua"}
    request.headers = fixed_headers
@middleware.response
async def print_on_response(spider_ins, request, response):
    """Assert that the response body is text and the request kept its headers.

    Checks, in order, that ``response.html`` is a ``str`` and that
    ``request.headers`` equals ``{"User-Agent": "ruia ua"}``.
    ``spider_ins`` is accepted but not used.
    """
    expected_headers = {"User-Agent": "ruia ua"}
    assert isinstance(response.html, str)
    assert request.headers == expected_headers
class ItemDemo(Item):
    """Item with a single ``title`` field selected by XPath."""

    # Text of the node at the absolute XPath "/html/head/title".
    title = TextField(xpath_select="/html/head/title")
class SpiderDemo(Spider):
start_urls = ["https://httpbin.org/get?p=0"]
request_config = {"RETRIES": 3, "DELAY": 0, "TIMEOUT": 20}
headers = {"User-Agent": "Ruia Spider"}
call_nums = 0
async def parse(self, response):
yield Request(
url=response.url,
callback=self.parse_item,
headers=self.headers,
request_config=self.request_config,
#!/usr/bin/env python
import asyncio
import time
from ruia import Spider, Item, AttrField, HtmlField, TextField
from ruia_ua import middleware
from owllook.database.mongodb import MotorBaseOld
class RankingItem(Item):
    """Field declarations for one ranking list block (``.rank-list``)."""

    # Selector ".rank-list"; presumably the per-item container -- confirm
    # against ruia's ``target_item`` semantics.
    target_item = TextField(css_select='.rank-list')

    # Text of the "h3.wrap-title" heading.
    ranking_title = TextField(css_select='h3.wrap-title')

    # ``href`` attribute of the "h3>a.more" link.
    more = AttrField(attr='href', css_select='h3>a.more')

    # HTML of each "div.book-list>ul>li" entry (``many=True``).
    book_list = HtmlField(css_select='div.book-list>ul>li', many=True)

    async def clean_ranking_title(self, ranking_title):
        """Reduce ``ranking_title`` to its leading name ending in '榜'.

        For a list, the ``.text`` of the first element is returned as-is.
        Otherwise the value is stringified, cut at the first '榜', and '榜'
        is appended back.
        """
        if not isinstance(ranking_title, list):
            head = str(ranking_title).split('榜')[0]
            return head + '榜'
        return ranking_title[0].text

    async def clean_more(self, more):
        """Prefix ``more`` with the ``https:`` scheme."""
        scheme = "https:"
        return scheme + more
class NameItem(Item):
from owllook.database.mongodb import MotorBase
from owllook.spiders.middlewares import owl_middleware
# Use uvloop's event loop policy when uvloop is installed; otherwise keep
# asyncio's default policy.
try:
    import uvloop
except ImportError:
    pass
else:
    asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())

# Obtain the event loop and register it as the current one.
loop = asyncio.get_event_loop()
asyncio.set_event_loop(loop)
class QidianNovelsItem(Item):
target_item = TextField(css_select='ul.all-img-list>li')
novel_url = AttrField(css_select='div.book-img-box>a', attr='href')
novel_name = TextField(css_select='div.book-mid-info>h4')
novel_author = TextField(css_select='div.book-mid-info>p.author>a.name')
novel_author_home_url = AttrField(css_select='div.book-mid-info>p.author>a.name', attr='href')
novel_type = TextField(css_select='div.book-mid-info > p.author > a:nth-child(4)')
novel_cover = AttrField(css_select='div.book-img-box img', attr='src')
novel_abstract = TextField(css_select='div.book-mid-info p.intro')
# novel_latest_chapter = TextField(css_select='div.bookupdate a')
async def clean_novel_url(self, novel_url):
return 'https:' + novel_url
async def clean_novel_author(self, novel_author):
if isinstance(novel_author, list):
#!/usr/bin/env python
"""
Created by howie.hu at 2018/9/8.
"""
from ruia import AttrField, TextField, Item
from ruia_pyppeteer import PyppeteerSpider as Spider
class JianshuItem(Item):
    """Field declarations for one author entry matched by ``ul.list>li``."""

    # Selector "ul.list>li"; presumably the per-item container -- confirm
    # against ruia's ``target_item`` semantics.
    target_item = TextField(css_select="ul.list>li")

    # Text of the "a.name" anchor.
    author_name = TextField(css_select="a.name")

    # ``href`` attribute of the same "a.name" anchor.
    author_url = AttrField(attr="href", css_select="a.name")

    async def clean_author_name(self, author_name):
        """Return ``author_name`` with surrounding whitespace stripped."""
        # The receiver was previously misspelled ``selfself``; it is the
        # bound instance, so it is named ``self`` like the sibling hook.
        return author_name.strip()

    async def clean_author_url(self, author_url):
        """Return ``author_url`` prefixed with ``https://www.jianshu.com``."""
        return f"https://www.jianshu.com{author_url}"
class JianshuSpider(Spider):
start_urls = ["https://www.jianshu.com/"]
concurrency = 10
async def parse(self, response):
#!/usr/bin/env python
"""
Target: https://news.ycombinator.com/
pip install aiofiles
"""
import aiofiles
from ruia import AttrField, TextField, Item, Spider
class HackerNewsItem(Item):
    """Field declarations for one Hacker News story row."""

    # Selector "tr.athing"; presumably the per-item container -- confirm
    # against ruia's ``target_item`` semantics.
    target_item = TextField(css_select="tr.athing")

    # Text of the "a.storylink" anchor.
    title = TextField(css_select="a.storylink")

    # ``href`` attribute of the same "a.storylink" anchor.
    url = AttrField(attr="href", css_select="a.storylink")

    async def clean_title(self, value):
        """Return ``value`` with leading and trailing whitespace removed."""
        stripped = value.strip()
        return stripped
class HackerNewsSpider(Spider):
start_urls = [
"https://news.ycombinator.com/news?p=1",
"https://news.ycombinator.com/news?p=2",
]
concurrency = 10
async def parse(self, response):
import asyncio
import sys
from ruia import Item, TextField, AttrField
class PythonDocumentationItem(Item):
    """Fields taken from a documentation page: its title and a tutorial link."""

    # Text of the element matched by the CSS selector "title".
    title = TextField(css_select="title")

    # ``href`` of the anchor whose text is exactly "Tutorial" (XPath).
    tutorial_link = AttrField(attr="href", xpath_select="//a[text()='Tutorial']")
async def field_extraction():
    """Load ``PythonDocumentationItem`` from docs.python.org and print it.

    Awaits ``PythonDocumentationItem.get_item`` for the fixed URL, then
    prints ``title`` followed by ``tutorial_link``, one per line.
    """
    doc_item = await PythonDocumentationItem.get_item(url="https://docs.python.org/3/")
    # Read and print one attribute at a time, in declaration order.
    for field_name in ("title", "tutorial_link"):
        print(getattr(doc_item, field_name))
if __name__ == "__main__":
if sys.version_info[:2] == (3, 7):
# Recommended for Python 3.7
asyncio.run(field_extraction())
else:
import time
from ruia import Spider, Item, AttrField, HtmlField, TextField
from ruia_ua import middleware
from owllook.database.mongodb import MotorBaseOld
class RankingItem(Item):
    """Field declarations for one ranking panel (``div.rank_i_p_list``)."""

    # Selector "div.rank_i_p_list"; presumably the per-item container --
    # confirm against ruia's ``target_item`` semantics.
    target_item = TextField(css_select='div.rank_i_p_list')

    # Text of the "div.rank_i_p_tit" heading.
    ranking_title = TextField(css_select='div.rank_i_p_tit')

    # ``href`` attribute of the link under "div.rank_i_more".
    more = AttrField(attr='href', css_select='div.rank_i_more a')

    # HTML of each "div.rank_i_p_list>div.rank_i_li" entry (``many=True``).
    book_list = HtmlField(css_select='div.rank_i_p_list>div.rank_i_li', many=True)
class NameItem(Item):
    """Two alternative book-name fields, each defaulting to an empty string."""

    # Text of "div.rank_i_bname a.rank_i_l_a_book"; default ''.
    top_name = TextField(default='', css_select='div.rank_i_bname a.rank_i_l_a_book')

    # Text of "div.rank_i_bname a"; default ''.
    other_name = TextField(default='', css_select='div.rank_i_bname a')
class ZHRankingSpider(Spider):
start_urls = ['http://book.zongheng.com/rank.html']
concurrency = 3
async def parse(self, res):
result = []
res_dic = {}
async for item in RankingItem.get_items(html=res.html):
each_book_list = []
# 只取排名前十的书籍数据
#!/usr/bin/env python
from ruia import AttrField, Item, Spider, TextField
class DoubanItem(Item):
    """Field declarations for one entry matched by ``div.item``."""

    # Selector "div.item"; presumably the per-item container -- confirm
    # against ruia's ``target_item`` semantics.
    target_item = TextField(css_select="div.item")

    # Text of "span.title".
    title = TextField(css_select="span.title")

    # ``src`` attribute of the image under "div.pic>a>img".
    cover = AttrField(attr="src", css_select="div.pic>a>img")

    # Text of "span.inq"; default "".
    abstract = TextField(default="", css_select="span.inq")

    async def clean_title(self, title):
        """Normalise ``title`` to a single string.

        A ``str`` is returned unchanged. Anything else is iterated: each
        element's ``.text`` is stripped, non-breaking spaces (``\\xa0``) are
        removed, and the pieces are concatenated without a separator.
        """
        if not isinstance(title, str):
            pieces = [node.text.strip().replace("\xa0", "") for node in title]
            return "".join(pieces)
        return title
class DoubanSpider(Spider):
start_urls = ["https://movie.douban.com/top250"]
request_config = {"RETRIES": 3, "DELAY": 0, "TIMEOUT": 20}
concurrency = 10