Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test_attr_field():
    """Check ``ruia.AttrField`` extraction of ``href`` for one and for many matches.

    Relies on the module-level ``html`` fixture being defined elsewhere.
    """
    # Single match: the field returns the attribute value itself.
    title = ruia.AttrField(css_select=".title", attr="href", default="Untitled")
    assert title.extract(html_etree=html) == "/"

    # many=True: the field returns a sequence; only the first entry is checked.
    tags = ruia.AttrField(css_select=".tag", attr="href", default="No tag", many=True)
    assert tags.extract(html_etree=html)[0] == "./easy.html"
def test_attr_field():
    """Verify that ``ruia.AttrField`` reads the ``href`` attribute correctly.

    Covers both the single-value form and the ``many=True`` form. The
    ``html`` tree it extracts from is expected to be defined at module level.
    """
    title_field = ruia.AttrField(css_select=".title", attr="href", default="Untitled")
    assert title_field.extract(html_etree=html) == "/"

    tag_field = ruia.AttrField(css_select=".tag", attr="href", default="No tag", many=True)
    assert tag_field.extract(html_etree=html)[0] == "./easy.html"
def clean_title(self, title):
    """Return *title* with the literal prefix ``"Title: "`` prepended.

    NOTE(review): the enclosing class is not visible in this snippet;
    ``self`` is not used by the body.
    """
    return "Title: " + title
class DoubanIgnoreItem(Item):
    """Item with a single ``title`` field whose cleaner always raises ``IgnoreThisItem``."""

    # Text of the <title> element inside <head>.
    title = TextField(css_select="head title")

    async def clean_title(self, title):
        """Raise ``IgnoreThisItem`` unconditionally; *title* is not inspected.

        NOTE(review): presumably invoked by ruia as the cleaning hook for
        the ``title`` field - confirm against the ruia documentation.
        """
        raise IgnoreThisItem
class HackerNewsItem(Item):
    """Item describing one entry selected by ``tr.athing``, with its title and link."""

    # NOTE(review): presumably the per-entry container selector used by
    # ruia to split the page into items - confirm against the ruia docs.
    target_item = TextField(css_select="tr.athing")
    # Text and href are both read from the same ``a.storylink`` anchor.
    title = TextField(css_select="a.storylink")
    url = AttrField(css_select="a.storylink", attr="href")
async def parse_item(html):
    """Collect every item yielded by ``DoubanItems.get_items`` for *html*.

    :param html: markup passed through unchanged as the ``html`` keyword.
    :return: list of the yielded items, in iteration order.
    """
    return [item async for item in DoubanItems.get_items(html=html)]
async def error_parse_item(html):
    """Collect every item yielded by ``DoubanItem.get_items`` for *html*.

    NOTE(review): the function name suggests this call is expected to fail
    (it uses ``DoubanItem``, not ``DoubanItems``) - confirm against the
    tests that call it.

    :param html: markup passed through unchanged as the ``html`` keyword.
    :return: list of the yielded items, in iteration order.
    """
    return [item async for item in DoubanItem.get_items(html=html)]
from pprint import pprint
from ruia import Spider, Item, TextField, AttrField
from ruia_ua import middleware as ua_middleware
from owllook.database.mongodb import MotorBase
class HYNovelInfoItem(Item):
    """
    Item subclass describing a novel's details.

    Most fields are read from the ``content`` attribute of the page's
    ``og:*`` meta tags; ``novel_chapter_url`` is read from an anchor's
    ``href`` instead.
    """
    novel_name = AttrField(css_select="meta[property='og:title']", attr='content')
    author = AttrField(css_select="meta[property='og:novel:author']", attr='content')
    cover = AttrField(css_select="meta[property='og:image']", attr='content')
    abstract = AttrField(css_select="meta[property='og:description']", attr='content')
    status = AttrField(css_select="meta[property='og:novel:status']", attr='content')
    novels_type = AttrField(css_select="meta[property='og:novel:category']", attr='content')
    novel_chapter_url = AttrField(css_select='div#voteList a.index', attr='href')
    latest_chapter = AttrField(css_select="meta[property='og:novel:latest_chapter_name']", attr='content')
    latest_chapter_url = AttrField(css_select="meta[property='og:novel:latest_chapter_url']", attr='content')
    latest_chapter_time = AttrField(css_select="meta[property='og:novel:update_time']", attr='content')

    # Alternative, DOM-based selectors kept for reference (disabled):
    # novel_name = TextField(css_select='div.c-left>div.mod>div.hd>h2')
    # author = TextField(css_select='div.author-zone div.right a.name strong')
    # cover = AttrField(css_select='img.book-cover', attr='src')
    # abstract = TextField(css_select='pre.note')
    # status = ''
    # novels_type = TextField(css_select='div.c-left>div.mod>div.hd>p.infos>span.cate>a')
    # latest_chapter = ''
    # novel_chapter_url = AttrField(css_select='div#voteList a.index', attr='href')
# NOTE(review): the `try:` opener that the `except` below belongs to is not
# visible in this snippet; only the tail of the statement is shown.
asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
except ImportError:
# ImportError is deliberately ignored, leaving the event loop policy unchanged.
pass
# Fetch the current event loop and register it as the current loop again.
loop = asyncio.get_event_loop()
asyncio.set_event_loop(loop)
class ZHNovelsItem(Item):
    """Item describing one novel entry matched by ``div.store_collist div.bookbox``."""

    # NOTE(review): presumably the per-entry container selector used by
    # ruia to split the page into items - confirm against the ruia docs.
    target_item = TextField(css_select='div.store_collist div.bookbox')
    novel_url = AttrField(css_select='div.bookinfo div.bookname a', attr='href')
    novel_name = TextField(css_select='div.bookinfo div.bookname a')
    novel_author = TextField(css_select='div.bookilnk a:nth-child(1)')
    novel_author_home_url = AttrField(css_select='div.bookilnk a:nth-child(1)', attr='href')
    novel_type = TextField(css_select='div.bookilnk a:nth-child(2)')
    novel_cover = AttrField(css_select='div.bookimg img', attr='src')
    novel_abstract = TextField(css_select='div.bookintro')
    novel_latest_chapter = TextField(css_select='div.bookupdate a')

    # def tal_novel_url(self, novel_url):
    # return 'http:' + novel_url

    async def clean_novel_author(self, novel_author):
        """Normalise the extracted author value.

        :param novel_author: extracted value; may be falsy, a list of
            elements exposing ``.text``, or an already-usable value.
        :return: ``''`` for a falsy value, the ``.text`` of the first
            element for a list, otherwise the value unchanged.
        """
        if not novel_author:
            return ''
        if isinstance(novel_author, list):
            # Several matches: keep only the first element's text.
            novel_author = novel_author[0].text
        return novel_author

    # def tal_novel_author_home_url(self, novel_author_home_url):
    # if isinstance(novel_author_home_url, list):
from owllook.database.mongodb import MotorBase
class HYNovelInfoItem(Item):
"""
Define an Item class that inherits from Item.
"""
# Each field below except novel_chapter_url reads the `content` attribute of an og:* meta tag.
novel_name = AttrField(css_select="meta[property='og:title']", attr='content')
author = AttrField(css_select="meta[property='og:novel:author']", attr='content')
cover = AttrField(css_select="meta[property='og:image']", attr='content')
abstract = AttrField(css_select="meta[property='og:description']", attr='content')
status = AttrField(css_select="meta[property='og:novel:status']", attr='content')
novels_type = AttrField(css_select="meta[property='og:novel:category']", attr='content')
# novel_chapter_url is read from an anchor's href rather than a meta tag.
novel_chapter_url = AttrField(css_select='div#voteList a.index', attr='href')
latest_chapter = AttrField(css_select="meta[property='og:novel:latest_chapter_name']", attr='content')
latest_chapter_url = AttrField(css_select="meta[property='og:novel:latest_chapter_url']", attr='content')
latest_chapter_time = AttrField(css_select="meta[property='og:novel:update_time']", attr='content')
# Alternative, DOM-based selectors kept for reference (disabled):
# novel_name = TextField(css_select='div.c-left>div.mod>div.hd>h2')
# author = TextField(css_select='div.author-zone div.right a.name strong')
# cover = AttrField(css_select='img.book-cover', attr='src')
# abstract = TextField(css_select='pre.note')
# status = ''
# novels_type = TextField(css_select='div.c-left>div.mod>div.hd>p.infos>span.cate>a')
# latest_chapter = ''
# novel_chapter_url = AttrField(css_select='div#voteList a.index', attr='href')
# NOTE(review): this copy of clean_cover is cut off after `else:`; the else
# branch is not visible in this snippet.
async def clean_cover(self, cover):
# A cover value that already contains 'https' is returned unchanged.
if 'https' in cover:
return cover
else:
class HYNovelInfoItem(Item):
    """
    Item subclass describing a novel's details.

    Most fields are read from the ``content`` attribute of the page's
    ``og:*`` meta tags; ``novel_chapter_url`` is read from an anchor's
    ``href`` instead.
    """
    novel_name = AttrField(css_select="meta[property='og:title']", attr='content')
    author = AttrField(css_select="meta[property='og:novel:author']", attr='content')
    cover = AttrField(css_select="meta[property='og:image']", attr='content')
    abstract = AttrField(css_select="meta[property='og:description']", attr='content')
    status = AttrField(css_select="meta[property='og:novel:status']", attr='content')
    novels_type = AttrField(css_select="meta[property='og:novel:category']", attr='content')
    novel_chapter_url = AttrField(css_select='div#voteList a.index', attr='href')
    latest_chapter = AttrField(css_select="meta[property='og:novel:latest_chapter_name']", attr='content')
    latest_chapter_url = AttrField(css_select="meta[property='og:novel:latest_chapter_url']", attr='content')
    latest_chapter_time = AttrField(css_select="meta[property='og:novel:update_time']", attr='content')

    # Alternative, DOM-based selectors kept for reference (disabled):
    # novel_name = TextField(css_select='div.c-left>div.mod>div.hd>h2')
    # author = TextField(css_select='div.author-zone div.right a.name strong')
    # cover = AttrField(css_select='img.book-cover', attr='src')
    # abstract = TextField(css_select='pre.note')
    # status = ''
    # novels_type = TextField(css_select='div.c-left>div.mod>div.hd>p.infos>span.cate>a')
    # latest_chapter = ''
    # novel_chapter_url = AttrField(css_select='div#voteList a.index', attr='href')

    async def clean_cover(self, cover):
        """Return *cover* with a leading ``http://`` scheme upgraded to ``https://``.

        Only the scheme prefix is rewritten. A blanket
        ``replace('http', 'https')`` would also rewrite any later ``http``
        substring in the host, path or query, corrupting the URL. Values
        that do not start with ``http://`` are returned unchanged.

        :param cover: cover image URL extracted from the ``og:image`` meta tag.
        :return: the URL using the ``https`` scheme where it was ``http``.
        """
        insecure_prefix = 'http://'
        if cover.startswith(insecure_prefix):
            return 'https://' + cover[len(insecure_prefix):]
        return cover
#!/usr/bin/env python
"""
Created by howie.hu at 29/11/2017.
"""
import time
from ruia import Spider, Item, AttrField, HtmlField, TextField
from ruia_ua import middleware
from owllook.database.mongodb import MotorBaseOld
class RankingItem(Item):
    """Item describing one ranking list block matched by ``div.rank_i_p_list``."""

    # NOTE(review): presumably the per-entry container selector used by
    # ruia to split the page into items - confirm against the ruia docs.
    target_item = TextField(css_select='div.rank_i_p_list')
    ranking_title = TextField(css_select='div.rank_i_p_tit')
    # href of the link inside ``div.rank_i_more``.
    more = AttrField(css_select='div.rank_i_more a', attr='href')
    # many=True: one HTML fragment per ``div.rank_i_li`` child.
    book_list = HtmlField(css_select='div.rank_i_p_list>div.rank_i_li', many=True)
class NameItem(Item):
    """Item holding the two name variants found under ``div.rank_i_bname``.

    Both fields default to ``''`` via their ``default`` argument.
    """

    top_name = TextField(css_select='div.rank_i_bname a.rank_i_l_a_book', default='')
    other_name = TextField(css_select='div.rank_i_bname a', default='')
class ZHRankingSpider(Spider):
    """Spider whose single start URL is the zongheng ranking page."""

    start_urls = ['http://book.zongheng.com/rank.html']
    concurrency = 3

    async def parse(self, res):
        """Handle the response *res* for a start URL.

        NOTE(review): only the initialisation of the two accumulators is
        visible in this snippet; the rest of the method body is not shown.
        """
        result = []
        res_dic = {}
from owllook.spiders.middlewares import owl_middleware
# Prefer uvloop's event loop policy when the package is installed.
try:
    import uvloop

    asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
except ImportError:
    # uvloop is optional: leave the event loop policy unchanged.
    pass

# Fetch the current event loop and register it as the current loop again.
loop = asyncio.get_event_loop()
asyncio.set_event_loop(loop)
class QidianNovelsItem(Item):
    """Item describing one novel entry matched by ``ul.all-img-list>li``."""

    # NOTE(review): presumably the per-entry container selector used by
    # ruia to split the page into items - confirm against the ruia docs.
    target_item = TextField(css_select='ul.all-img-list>li')
    novel_url = AttrField(css_select='div.book-img-box>a', attr='href')
    novel_name = TextField(css_select='div.book-mid-info>h4')
    novel_author = TextField(css_select='div.book-mid-info>p.author>a.name')
    novel_author_home_url = AttrField(css_select='div.book-mid-info>p.author>a.name', attr='href')
    novel_type = TextField(css_select='div.book-mid-info > p.author > a:nth-child(4)')
    novel_cover = AttrField(css_select='div.book-img-box img', attr='src')
    novel_abstract = TextField(css_select='div.book-mid-info p.intro')
    # novel_latest_chapter = TextField(css_select='div.bookupdate a')

    async def clean_novel_url(self, novel_url):
        """Return *novel_url* with ``'https:'`` prepended."""
        return 'https:' + novel_url

    async def clean_novel_author(self, novel_author):
        """Return the author value, unwrapping a list to its first element's ``.text``.

        :param novel_author: extracted value; either a list of elements
            exposing ``.text`` or an already-usable value.
        :return: the first element's text for a list, otherwise the value
            unchanged.
        """
        if isinstance(novel_author, list):
            novel_author = novel_author[0].text
        return novel_author
# NOTE(review): the `try:` opener that the `except` below belongs to is not
# visible in this snippet.
import uvloop
asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
except ImportError:
# ImportError is deliberately ignored, leaving the event loop policy unchanged.
pass
# Fetch the current event loop and register it as the current loop again.
loop = asyncio.get_event_loop()
asyncio.set_event_loop(loop)
# Item describing one novel entry matched by `ul.all-img-list>li`.
class QidianNovelsItem(Item):
# NOTE(review): presumably the per-entry container selector used by ruia to
# split the page into items - confirm against the ruia docs.
target_item = TextField(css_select='ul.all-img-list>li')
novel_url = AttrField(css_select='div.book-img-box>a', attr='href')
novel_name = TextField(css_select='div.book-mid-info>h4')
novel_author = TextField(css_select='div.book-mid-info>p.author>a.name')
novel_author_home_url = AttrField(css_select='div.book-mid-info>p.author>a.name', attr='href')
novel_type = TextField(css_select='div.book-mid-info > p.author > a:nth-child(4)')
novel_cover = AttrField(css_select='div.book-img-box img', attr='src')
novel_abstract = TextField(css_select='div.book-mid-info p.intro')
# novel_latest_chapter = TextField(css_select='div.bookupdate a')
# Prepends 'https:' to the extracted novel URL.
async def clean_novel_url(self, novel_url):
return 'https:' + novel_url
# Unwraps a list value to the `.text` of its first element; other values
# are returned unchanged.
async def clean_novel_author(self, novel_author):
if isinstance(novel_author, list):
novel_author = novel_author[0].text
return novel_author
# NOTE(review): this method is cut off after its `if` line; the rest of the
# body is not visible in this snippet.
async def clean_novel_author_home_url(self, novel_author_home_url):
if isinstance(novel_author_home_url, list):