        # (tail of the feeder's get_filter(), truncated in this snippet)
        return search_filter

    def feed(self, keyword, offset, max_num, filters=None):
        base_url = ('http://image.baidu.com/search/acjson?tn=resultjson_com'
                    '&ipn=rj&word={}&pn={}&rn=30')
        self.filter = self.get_filter()
        filter_str = self.filter.apply(filters, sep='&')
        # Baidu's acjson endpoint returns 30 results per request, so the page
        # offset advances in steps of 30.
        for i in range(offset, offset + max_num, 30):
            url = base_url.format(keyword, i)
            if filter_str:
                url += '&' + filter_str
            self.out_queue.put(url)
            self.logger.debug('put url to url_queue: {}'.format(url))
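
# A minimal usage sketch, assuming the packaged BaiduImageCrawler from
# icrawler.builtin wires this feeder together with the Baidu parser below;
# the keyword and filter values are placeholders and valid filter keys depend
# on the icrawler version.
from icrawler.builtin import BaiduImageCrawler

baidu_crawler = BaiduImageCrawler(storage={'root_dir': 'baidu_images'})
baidu_crawler.crawl(keyword='cat', max_num=60, filters={'color': 'blue'})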
class BaiduParser(Parser):

    def _decode_url(self, encrypted_url):
        url = encrypted_url
        # Multi-character tokens stand in for ':', '.' and '/'.
        map1 = {'_z2C$q': ':', '_z&e3B': '.', 'AzdH3F': '/'}
        # Single-character substitution table for the rest of the URL.
        map2 = {
            'w': 'a', 'k': 'b', 'v': 'c', '1': 'd', 'j': 'e',
            'u': 'f', '2': 'g', 'i': 'h', 't': 'i', '3': 'j',
            'h': 'k', 's': 'l', '4': 'm', 'g': 'n', '5': 'o',
            'r': 'p', 'q': 'q', '6': 'r', 'f': 's', 'p': 't',
            '7': 'u', 'e': 'v', 'o': 'w', '8': '1', 'd': '2',
            'n': '3', '9': '4', 'c': '5', 'm': '6', '0': '7',
            'b': '8', 'l': '9', 'a': '0'
        }  # yapf: disable
        for (ciphertext, plaintext) in map1.items():
            url = url.replace(ciphertext, plaintext)
        char_list = [char for char in url]
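
# The _decode_url snippet is cut off above. A self-contained sketch (an
# assumption, not taken from the source) of how the two maps shown could be
# applied to finish decoding an obfuscated Baidu image URL:
def decode_url_sketch(encrypted_url, map1, map2):
    url = encrypted_url
    # Replace the multi-character tokens first, then map the remaining
    # obfuscated characters one by one, leaving unknown characters untouched.
    for ciphertext, plaintext in map1.items():
        url = url.replace(ciphertext, plaintext)
    return ''.join(map2.get(char, char) for char in url)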
class Crawler(object):

    def __init__(self,
                 feeder_cls=Feeder,
                 parser_cls=Parser,
                 downloader_cls=Downloader,
                 feeder_threads=1,
                 parser_threads=1,
                 downloader_threads=1,
                 storage={
                     'backend': 'FileSystem',
                     'root_dir': 'images'
                 },
                 log_level=logging.INFO,
                 extra_feeder_args=None,
                 extra_parser_args=None,
                 extra_downloader_args=None):
        """Init components with class names and other arguments.

        Args:
            feeder_cls: class of feeder
        """
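
# A hedged construction sketch using the arguments in the signature above:
# thread counts, storage backend and log level are tuned without subclassing.
# The packaged crawlers normally pass concrete feeder/parser classes instead
# of the bare defaults shown here.
import logging
from icrawler import Crawler

crawler = Crawler(
    feeder_threads=1,
    parser_threads=2,
    downloader_threads=4,
    storage={'backend': 'FileSystem', 'root_dir': 'images'},
    log_level=logging.DEBUG)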
# -*- coding: utf-8 -*-
import re

from bs4 import BeautifulSoup
from six.moves import html_parser

from icrawler import Crawler, Parser, SimpleSEFeeder, ImageDownloader


class BingParser(Parser):

    def parse(self, response):
        soup = BeautifulSoup(response.content, 'lxml')
        image_divs = soup.find_all('div', class_='imgpt')
        pattern = re.compile(r'murl\":\"(.*?)\.jpg')
        for div in image_divs:
            # The anchor's 'm' attribute holds HTML-escaped JSON metadata;
            # unescape it, then pull the original image URL ("murl") out of it.
            href_str = html_parser.HTMLParser().unescape(div.a['m'])
            match = pattern.search(href_str)
            if match:
                img_url = '{}.jpg'.format(match.group(1))
                yield dict(file_url=img_url)
class BingImageCrawler(Crawler):

    def __init__(self,
        # (tail of the feeder's get_filter(), truncated in this snippet)
        return search_filter

    def feed(self, keyword, offset, max_num, filters=None):
        base_url = 'https://www.bing.com/images/async?q={}&first={}'
        self.filter = self.get_filter()
        filter_str = self.filter.apply(filters)
        filter_str = '&qft=' + filter_str if filter_str else ''
        # Bing's async endpoint pages through results 20 at a time.
        for i in range(offset, offset + max_num, 20):
            url = base_url.format(keyword, i) + filter_str
            self.out_queue.put(url)
            self.logger.debug('put url to url_queue: {}'.format(url))
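
# A minimal usage sketch, assuming the packaged BingImageCrawler from
# icrawler.builtin combines this feeder with the Bing parser; the keyword and
# filter values are placeholders.
from icrawler.builtin import BingImageCrawler

bing_crawler = BingImageCrawler(storage={'root_dir': 'bing_images'})
bing_crawler.crawl(keyword='cat', max_num=40, filters={'size': 'large'})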
class BingParser(Parser):

    def parse(self, response):
        soup = BeautifulSoup(
            response.content.decode('utf-8', 'ignore'), 'lxml')
        image_divs = soup.find_all('div', class_='imgpt')
        pattern = re.compile(r'murl\":\"(.*?)\.jpg')
        for div in image_divs:
            href_str = html_parser.HTMLParser().unescape(div.a['m'])
            match = pattern.search(href_str)
            if match:
                # `six` is imported at module level in the full source; on
                # Python 2 the captured URL is encoded back to bytes.
                name = (match.group(1)
                        if six.PY3 else match.group(1).encode('utf-8'))
                img_url = '{}.jpg'.format(name)
                yield dict(file_url=img_url)
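
# A small self-contained check of the murl regex used above. The metadata
# string is made up for illustration only; it just mimics the shape of Bing's
# unescaped 'm' attribute.
import re

sample_meta = '{"murl":"http://example.com/photo.jpg","turl":"..."}'
match = re.compile(r'murl\":\"(.*?)\.jpg').search(sample_meta)
assert match and match.group(1) == 'http://example.com/photo'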
class GoogleFeeder(Feeder):

    def feed(self, keyword, offset, max_num, date_min, date_max):
        base_url = 'https://www.google.com/search?site=imghp&tbm=isch&source=hp&'
        for i in range(offset, offset + max_num, 100):
            cd_min = date_min.strftime('%d/%m/%Y') if date_min else ''
            cd_max = date_max.strftime('%d/%m/%Y') if date_max else ''
            tbs = 'cdr:1,cd_min:{},cd_max:{}'.format(cd_min, cd_max)
            params = dict(
                q=keyword, ijn=int(i / 100), start=i, tbs=tbs, tbm='isch')
            url = base_url + urlencode(params)
            self.out_queue.put(url)
            self.logger.debug('put url to url_queue: {}'.format(url))
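
# A usage sketch matching the date_min/date_max arguments of this feeder,
# assuming an icrawler release whose GoogleImageCrawler.crawl() exposes them
# directly; the keyword and dates are placeholders.
from datetime import date
from icrawler.builtin import GoogleImageCrawler

google_crawler = GoogleImageCrawler(storage={'root_dir': 'google_images'})
google_crawler.crawl(keyword='sunset', max_num=100,
                     date_min=date(2016, 1, 1), date_max=date(2016, 6, 30))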
class GoogleParser(Parser):

    def parse(self, response):
        soup = BeautifulSoup(response.content, 'lxml')
        image_divs = soup.find_all('div', class_='rg_meta')
        for div in image_divs:
            # Each 'rg_meta' div carries a JSON blob; 'ou' is the original URL.
            meta = json.loads(div.text)
            if 'ou' in meta:
                yield dict(file_url=meta['ou'])


class GoogleImageCrawler(Crawler):

    def __init__(self,
                 feeder_cls=GoogleFeeder,
                 parser_cls=GoogleParser,
                 downloader_cls=ImageDownloader,
        # (body of a GoogleFeeder.feed variant that takes a filters dict and an
        # optional language, truncated in this snippet)
        filter_str = self.filter.apply(filters, sep=',')
        for i in range(offset, offset + max_num, 100):
            params = dict(
                q=keyword,
                ijn=int(i / 100),
                start=i,
                tbs=filter_str,
                tbm='isch')
            if language:
                params['lr'] = 'lang_' + language
            url = base_url + urlencode(params)
            self.out_queue.put(url)
            self.logger.debug('put url to url_queue: {}'.format(url))
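
# A usage sketch for the filter/language variant above, assuming the packaged
# GoogleImageCrawler accepts a filters dict and a language argument on crawl();
# the filter keys and values are placeholders.
from icrawler.builtin import GoogleImageCrawler

google_crawler = GoogleImageCrawler(storage={'root_dir': 'google_images'})
google_crawler.crawl(keyword='sunset', max_num=100,
                     filters={'color': 'orange', 'type': 'photo'},
                     language='en')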
class GoogleParser(Parser):

    def parse(self, response):
        soup = BeautifulSoup(
            response.content.decode('utf-8', 'ignore'), 'lxml')
        image_divs = soup.find_all('div', class_='rg_meta')
        for div in image_divs:
            meta = json.loads(div.text)
            if 'ou' in meta:
                yield dict(file_url=meta['ou'])


class GoogleImageCrawler(Crawler):

    def __init__(self,
                 feeder_cls=GoogleFeeder,
                 parser_cls=GoogleParser,
        # Paging loop from the Flickr feeder's feed() (truncated above): push
        # one search URL per result page.
        for i in range(page, page + page_max):
            if self.signal.get('reach_max_num'):
                break
            complete_url = '{}&page={}'.format(url, i)
            while True:
                try:
                    self.output(complete_url, block=False)
                except:  # output queue is full; retry unless the crawl is done
                    if self.signal.get('reach_max_num'):
                        break
                else:
                    break
            self.logger.debug('put url to url_queue: {}'.format(complete_url))
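
# A minimal usage sketch, assuming the packaged FlickrImageCrawler from
# icrawler.builtin takes the API key in its constructor and forwards search
# parameters such as tags to the Flickr API; the key and tag are placeholders.
from icrawler.builtin import FlickrImageCrawler

flickr_crawler = FlickrImageCrawler('YOUR_APIKEY',
                                    storage={'root_dir': 'flickr_images'})
flickr_crawler.crawl(max_num=100, tags='kitten')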
class FlickrParser(Parser):

    def parse(self, response, apikey, size_preference=None):
        content = json.loads(response.content.decode('utf-8', 'ignore'))
        if content['stat'] != 'ok':
            return
        photos = content['photos']['photo']
        for photo in photos:
            photo_id = photo['id']
            # Ask flickr.photos.getSizes for each photo so a concrete download
            # URL can be chosen according to size_preference.
            base_url = 'https://api.flickr.com/services/rest/?'
            params = {
                'method': 'flickr.photos.getSizes',
                'api_key': apikey,
                'photo_id': photo_id,
                'format': 'json',
                'nojsoncallback': 1
            }
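
# Sketch of the step implied by base_url and params above: form the
# flickr.photos.getSizes request URL with urlencode. The helper name is
# hypothetical; the parser presumably requests this URL next to pick a size.
from six.moves.urllib.parse import urlencode

def build_getsizes_url(base_url, params):
    # Append the URL-encoded query parameters to the REST endpoint.
    return base_url + urlencode(params)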
import threading

from six.moves import queue

from icrawler import Crawler, Parser, UrlListFeeder, ImageDownloader


class PseudoParser(Parser):

    def worker_exec(self, queue_timeout=2, **kwargs):
        while True:
            # Stop once the downloader reports the requested number of images.
            if self.signal.get('reach_max_num'):
                self.logger.info('downloaded image reached max num, thread %s'
                                 ' exit',
                                 threading.current_thread().name)
                break
            try:
                url = self.in_queue.get(timeout=queue_timeout)
            except queue.Empty:
                # No pending page URLs and the feeder is finished: exit.
                if self.signal.get('feeder_exited'):
                    self.logger.info('no more page urls to parse, thread %s'
                                     ' exit',
                                     threading.current_thread().name)
                    break
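
# A hedged wiring sketch: combine the components imported above into a crawler
# for a plain list of image URLs, assuming the base Crawler.crawl() accepts
# per-component kwargs dicts; 'url_list.txt' and the url_list argument name
# are placeholders.
url_crawler = Crawler(
    feeder_cls=UrlListFeeder,
    parser_cls=PseudoParser,
    downloader_cls=ImageDownloader,
    downloader_threads=4,
    storage={'root_dir': 'url_list_images'})
url_crawler.crawl(feeder_kwargs=dict(url_list='url_list.txt'),
                  downloader_kwargs=dict(max_num=100))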
import re
import time

from bs4 import BeautifulSoup
from six.moves.urllib.parse import urljoin, urlsplit

from icrawler import Crawler, Feeder, Parser, ImageDownloader


class GreedyFeeder(Feeder):

    def feed(self, domains):
        for domain in domains:
            self.output(domain)
        # Keep the feeder thread alive until the crawler reaches max_num.
        while not self.signal.get('reach_max_num'):
            time.sleep(1)


class GreedyParser(Parser):

    def __init__(self, *args, **kwargs):
        # Match links that look like image files.
        self.pattern = re.compile(
            r'(http|\/\/)(.*)\.(jpg|jpeg|png|bmp|gif|tiff)')
        super(GreedyParser, self).__init__(*args, **kwargs)

    def is_in_domain(self, url, domains):
        for domain in domains:
            if domain in url:
                return True
        return False

    def parse(self, response, domains):
        soup = BeautifulSoup(
            response.content.decode('utf-8', 'ignore'), 'lxml')
        tags = soup.find_all('img', src=True)
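
# A minimal usage sketch, assuming the packaged GreedyImageCrawler from
# icrawler.builtin uses this feeder/parser pair to harvest every image it can
# reach on the given domains; the domain and limit are placeholders.
from icrawler.builtin import GreedyImageCrawler

greedy_crawler = GreedyImageCrawler(storage={'root_dir': 'greedy_images'})
greedy_crawler.crawl(domains='http://example.com', max_num=100)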