Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test_dont_redirect(self):
    """Check that the middleware hands back the very same response object.

    Two scenarios are exercised, and in both the object returned by
    ``process_response`` must be the response that was passed in.
    """
    source_url = 'http://www.example.com/301'
    target_url = 'http://www.example.com/redirected'

    # Scenario 1: a 301 with a Location header, but the request sets
    # dont_redirect=True in its meta.
    request = Request(source_url, meta={'dont_redirect': True})
    response = Response(source_url, status=301, headers={'Location': target_url})
    outcome = self.mw.process_response(request, response, self.spider)
    assert isinstance(outcome, Response)
    assert outcome is response

    # Scenario 2: dont_redirect=False, paired with a plain 200 response that
    # carries no Location header; it is passed through unchanged as well.
    request = Request(source_url, meta={'dont_redirect': False})
    response = Response(target_url, status=200)
    outcome = self.mw.process_response(request, response, self.spider)
    assert isinstance(outcome, Response)
    assert outcome is response
def test_results_are_cached_across_multiple_items(self):
    """Process two items whose requests share a fingerprint and compare results.

    This is a generator-style test: each ``yield`` hands the value returned by
    ``self.pipe.process_item`` to whatever drives the generator.
    NOTE(review): the driving decorator is not visible in this view -- confirm.
    """
    first_response = Response('http://url1')
    first_request = Request('http://url1', meta={'response': first_response})
    item = {'requests': first_request}
    processed = yield self.pipe.process_item(item, self.spider)
    self.assertTrue(processed is item)
    self.assertEqual(processed['results'], [(True, first_response)])

    # The second request targets the same URL but carries a different
    # response in its meta. Because both requests have the same fingerprint,
    # the results must still contain the first response.
    ignored_response = Response('http://donot.download.me')
    second_request = Request(first_request.url, meta={'response': ignored_response})
    item = {'requests': second_request}
    processed = yield self.pipe.process_item(item, self.spider)
    self.assertTrue(processed is item)
    self.assertEqual(
        request_fingerprint(first_request),
        request_fingerprint(second_request),
    )
    self.assertEqual(processed['results'], [(True, first_response)])
def test_hs_mware_process_spider_output_filter_request(hs_mware):
    """Check that only requests in the spider output get the parent id meta key."""
    parent = Response('http://resp-url')

    # The spider output holds one response (with its own request attached)
    # and one fresh request.
    yielded_response = Response('http://resp-url-child')
    yielded_response.request = Request('http://resp-url-child-req')
    yielded_request = Request('http://req-url-child')

    # Register the parent response as already seen under the id 'riq'.
    seen = WeakKeyDictionary({parent: 'riq'})
    hs_mware._seen = seen

    spider_output = [yielded_response, yielded_request]
    result = list(
        hs_mware.process_spider_output(parent, spider_output, Spider('test'))
    )
    assert len(result) == 2

    # The parent id must be set on the request only, never on the response.
    assert result[0].meta.get(HS_PARENT_ID_KEY) is None
    assert result[1].meta[HS_PARENT_ID_KEY] == 'riq'
assert mw.storage.retrieve_response(self.spider, req) is None
assert mw.process_request(req, self.spider) is None
# s3 scheme response is cached by default
req, res = Request('s3://bucket/key'), Response('http://bucket/key')
with self._middleware() as mw:
assert mw.process_request(req, self.spider) is None
mw.process_response(req, res, self.spider)
cached = mw.process_request(req, self.spider)
assert isinstance(cached, Response), type(cached)
self.assertEqualResponse(res, cached)
assert 'cached' in cached.flags
# ignore s3 scheme
req, res = Request('s3://bucket/key2'), Response('http://bucket/key2')
with self._middleware(HTTPCACHE_IGNORE_SCHEMES=['s3']) as mw:
assert mw.process_request(req, self.spider) is None
mw.process_response(req, res, self.spider)
assert mw.storage.retrieve_response(self.spider, req) is None
assert mw.process_request(req, self.spider) is None
def test_get_sitemap_body_xml_url_compressed(self):
    """Check the sitemap body obtained from a ``.xml.gz`` URL with a GZBODY payload."""
    gz_url = "http://www.example.com/sitemap.xml.gz"
    response = Response(url=gz_url, body=self.GZBODY)
    # The body for this response is expected to match self.BODY.
    self.assertSitemapBody(response, self.BODY)
def _mock_crawlera_response(self, url, headers=None, **kwargs):
    """Build a ``Response`` that always carries an ``X-Crawlera-Version`` header.

    Any truthy ``headers`` are merged on top of the default header, so a
    caller-supplied value for the same key wins. Remaining keyword arguments
    are forwarded to ``Response`` unchanged.
    """
    merged_headers = {"X-Crawlera-Version": "1.36.3-cd5e44"}
    # A falsy ``headers`` (None, empty) contributes nothing.
    merged_headers.update(headers or {})
    return Response(url, headers=merged_headers, **kwargs)
def from_headers(self, headers):
    """Return the most appropriate Response class by looking at the HTTP
    headers.

    ``Content-Type`` is consulted first. ``Content-Disposition`` is only used
    as a fallback when the result so far is still the generic ``Response``.
    If neither header is present, ``Response`` itself is returned.
    """
    cls = Response
    # Look the header up under exactly the key that was tested for membership,
    # so case-sensitive mappings (e.g. a plain dict) do not raise KeyError.
    if 'Content-Type' in headers:
        cls = self.from_content_type(headers['Content-Type'])
    if cls is Response and 'Content-Disposition' in headers:
        cls = self.from_content_disposition(headers['Content-Disposition'])
    return cls
def __init__(self):
    """Attach ``getsoup`` to the ``Response`` class as a method and a property.

    Note that this modifies ``Response`` itself, not the instance being
    initialised: ``getsoup`` becomes callable as a method, and ``soup``
    exposes the same function as a read-only property.
    """
    Response.getsoup = getsoup
    Response.soup = property(getsoup)
def default(self, o):
    """Convert objects the base encoder cannot handle into serialisable values.

    Dates and times are formatted with ``self.DATE_FORMAT`` and
    ``self.TIME_FORMAT``; decimals and deferreds become their ``str()``;
    items become plain dicts; requests and responses become a short
    ``<ClassName method-or-status url>`` string. Anything else is delegated
    to the parent class.
    """
    # datetime must be tested before date: datetime is a subclass of date.
    if isinstance(o, datetime.datetime):
        combined_format = "%s %s" % (self.DATE_FORMAT, self.TIME_FORMAT)
        return o.strftime(combined_format)
    if isinstance(o, datetime.date):
        return o.strftime(self.DATE_FORMAT)
    if isinstance(o, datetime.time):
        return o.strftime(self.TIME_FORMAT)
    if isinstance(o, decimal.Decimal):
        return str(o)
    if isinstance(o, defer.Deferred):
        return str(o)
    if isinstance(o, BaseItem):
        return dict(o)
    if isinstance(o, Request):
        class_name = type(o).__name__
        return "<%s %s %s>" % (class_name, o.method, o.url)
    if isinstance(o, Response):
        class_name = type(o).__name__
        return "<%s %s %s>" % (class_name, o.status, o.url)
    # Not a type handled here: let the parent class deal with it.
    return super(ScrapyJSONEncoder, self).default(o)