Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
:param is_async_start:
"""
if not self.start_urls or not isinstance(self.start_urls, collections.Iterable):
raise ValueError(
"Ruia spider must have a param named start_urls, eg: start_urls = ['https://www.github.com']")
# Init object-level properties
self.callback_result_map = self.callback_result_map or {}
self.request_config = self.request_config or {}
self.headers = self.headers or {}
self.metadata = self.metadata or {}
self.kwargs = self.kwargs or {}
self.request_config = self.request_config or {}
self.is_async_start = is_async_start
self.logger = get_logger(name='Spider')
self.loop = loop
asyncio.set_event_loop(self.loop)
# customize middleware
if isinstance(middleware, list):
self.middleware = reduce(lambda x, y: x + y, middleware)
else:
self.middleware = middleware or Middleware()
# async queue as a producer
self.request_queue = asyncio.Queue()
# semaphore, used for concurrency control
self.sem = asyncio.Semaphore(self.concurrency)
if self.method not in self.METHOD:
raise InvalidRequestMethod(f"{self.method} method is not supported")
self.callback = callback
self.encoding = encoding
self.headers = headers or {}
self.metadata = metadata or {}
self.request_session = request_session
self.request_config = (
self.REQUEST_CONFIG if request_config is None else request_config
)
self.ssl = aiohttp_kwargs.pop("ssl", False)
self.aiohttp_kwargs = aiohttp_kwargs
self.close_request_session = False
self.logger = get_logger(name=self.name)
self.retry_times = self.request_config.get("RETRIES", 3)
#!/usr/bin/env python
import os
from importlib import util
from ruia.utils import get_logger
# Module-level logger obtained from ruia's get_logger helper under the name 'settings'.
logger = get_logger('settings')
class SettingsWrapper:
    """
    Callable holder for a spider's configuration dictionary.

    Calling an instance returns the dictionary kept in ``my_settings``.
    """

    def __init__(self, settings_name='settings.py'):
        """
        Create the wrapper and trigger the initial settings load.

        :param settings_name: name of the settings file, kept on the instance
            as ``settings_name``
        """
        self.settings_name = settings_name
        self.my_settings = {}
        # NOTE(review): _load_settings is defined outside this view; it
        # presumably fills my_settings from settings_name -- confirm.
        self._load_settings()

    def __call__(self):
        """
        Return the settings dictionary.

        :return: the ``my_settings`` dict itself (not a copy)
        """
        return self.my_settings
def settings(self):
# Use uvloop's event loop policy when uvloop can be imported; if the import
# fails, the current asyncio policy is left unchanged.
# NOTE(review): `asyncio` is not imported in the visible lines -- it is
# expected to be imported elsewhere in the module; confirm.
try:
    import uvloop
    asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
except ImportError:
    # uvloop is not installed: nothing to configure.
    pass
class SpiderHook:
"""
SpiderHook is used for extend spider
"""
callback_result_map: dict = None
logger = get_logger(name="Spider")
async def _run_spider_hook(self, hook_func):
"""
Run hook before/after spider start crawling
:param hook_func: aws function
:return:
"""
if callable(hook_func):
try:
aws_hook_func = hook_func(weakref.proxy(self))
if isawaitable(aws_hook_func):
await aws_hook_func
except Exception as e:
self.logger.error(f"