说明
unsplash 网站比较慢, 只爬数据就用了30分钟, 爬了945页, 每页有24条数据. 尝试下载图片, 更加慢, 平均每张图片大约10M, 如果要下载完全部图片大约要 945 × 24 × 10 / 1024 ≈ 222G, 还是算了吧.
爬虫代码
根据 aosabook/500lines 的爬虫修改而来, 尝试使用异步来写爬虫.
使用chrome浏览器开发工具来分析入口,在 network 里面的 xhr 就可以看到了, 还可以看到请求的头部,因为该网站需要 authorization 才可以爬取.
crawl.py
import asyncio
import crawling
def main():
    """Run the crawler to completion, then print a summary and clean up.

    A ``KeyboardInterrupt`` stops the crawl early; the summary is still
    attempted and the HTTP session and event loop are always released.
    """
    loop = asyncio.get_event_loop()
    crawler = crawling.Crawler()
    try:
        loop.run_until_complete(crawler.crawl())
    except KeyboardInterrupt:
        print('\nInterrupted\n')
    finally:
        try:
            crawler.report()
        finally:
            # report() can raise (for example when the crawl was interrupted
            # before its end time was recorded); the cleanup below must run
            # regardless so the session and the loop are not leaked.
            crawler.close()
            # stop() followed by run_forever() runs one more pass of the
            # loop so callbacks that are already scheduled get to execute
            # before the loop is closed.
            loop.stop()
            loop.run_forever()
            loop.close()


if __name__ == '__main__':
    main()
crawling.py
import aiohttp
import asyncio
import time
import urllib.parse
from db import insert_many
def is_redirect(response):
    """Return True when *response* has an HTTP redirect status code.

    Only the ``status`` attribute of *response* is read. The recognised
    codes are 300, 301, 302, 303, 307 and 308; 304 (Not Modified) is not a
    redirect and is deliberately excluded.
    """
    # The previous tuple listed 301 twice and omitted 302, so "302 Found"
    # redirects were parsed as ordinary pages.
    return response.status in (300, 301, 302, 303, 307, 308)
class Crawler:
    """Crawl the paginated Unsplash photo API and store each page.

    The crawl starts at page 1 of ``base_url``. Every 200 response with a
    non-empty JSON body is passed to ``insert_many`` and the URL of the
    next page is queued. The crawl ends when the queue drains: an empty
    page, a non-200 response or a request that failed on every try queues
    nothing further.
    """

    def __init__(self,
                 max_redirect=10,
                 max_tries=4,
                 max_tasks=10,
                 *,
                 loop=None):
        """Set up the work queue, the HTTP session and the first URL.

        Args:
            max_redirect: redirect hops allowed for a queued URL.
            max_tries: attempts per URL before the request is abandoned.
            max_tasks: number of concurrent worker tasks.
            loop: event loop to use; defaults to ``asyncio.get_event_loop()``.
        """
        self.max_redirect = max_redirect
        self.max_tries = max_tries
        self.max_tasks = max_tasks
        self.loop = loop or asyncio.get_event_loop()
        try:
            self.q = asyncio.Queue(loop=self.loop)
        except TypeError:
            # Python 3.10 removed the ``loop`` argument from asyncio.Queue;
            # there the queue attaches to the loop it is first awaited on.
            self.q = asyncio.Queue()
        self.seen_urls = set()
        self.session = aiohttp.ClientSession(loop=self.loop)
        # NOTE(review): the credential is hard-coded; consider loading it
        # from configuration instead.
        self.headers = {
            'authorization':
            'Client-ID d69927c7ea5c770fa2ce9a2f1e3589bd896454f7068f689d8e41a25b54fa6042'
        }
        self.base_url = 'https://unsplash.com/napi/photos?per_page=24&order_by=latest&page=%s'
        # Number of the most recently queued page.
        self.page = 1
        self.add_url(self.base_url % (1))
        # Wall-clock start/end of crawl(); both stay None until it runs.
        self.t0 = None
        self.t1 = None

    def add_url(self, url, max_redirect=None):
        """Mark *url* as seen and queue it with its redirect budget."""
        if max_redirect is None:
            max_redirect = self.max_redirect
        self.seen_urls.add(url)
        self.q.put_nowait((url, max_redirect))

    async def crawl(self):
        """Run ``max_tasks`` workers until every queued URL is processed."""
        workers = [
            self.loop.create_task(self.work())
            for _ in range(self.max_tasks)
        ]
        self.t0 = time.time()
        await self.q.join()
        self.t1 = time.time()
        for w in workers:
            w.cancel()

    def report(self):
        """Print the elapsed crawl time and the page counter.

        Safe to call after an interrupted crawl: if the end time was never
        recorded the current time is used, and if the crawl never started
        the elapsed time is reported as 0.0.
        """
        if self.t0 is None:
            elapsed = 0.0
        else:
            end = self.t1 if self.t1 is not None else time.time()
            elapsed = end - self.t0
        print('total time: ', elapsed)
        print('total page:', self.page)

    async def work(self):
        """Worker loop: take URLs off the queue and fetch them until cancelled."""
        try:
            while True:
                url, max_redirect = await self.q.get()
                try:
                    await self.fetch(url, max_redirect)
                except asyncio.CancelledError:
                    raise
                except Exception as exc:
                    # Keep the worker alive; one bad page must not stop
                    # the remaining queue from being processed.
                    print('error while fetching', url, repr(exc))
                finally:
                    # Always balance q.get(); otherwise q.join() in crawl()
                    # would wait forever after a failed fetch.
                    self.q.task_done()
        except asyncio.CancelledError:
            pass

    async def fetch(self, url, max_redirect):
        """Fetch *url*, follow a redirect or parse the page, and queue what follows.

        The request is retried up to ``max_tries`` times on
        ``aiohttp.ClientError``; when every try fails the URL is dropped.
        """
        tries = 0
        last_error = None
        while tries < self.max_tries:
            try:
                response = await self.session.get(
                    url, headers=self.headers, allow_redirects=False)
                break
            except aiohttp.ClientError as exc:
                last_error = exc
            tries += 1
        else:
            # All tries failed.
            print('giving up on', url, repr(last_error))
            return

        try:
            if is_redirect(response):
                location = response.headers['location']
                next_url = urllib.parse.urljoin(url, location)
                if next_url in self.seen_urls:
                    return
                if max_redirect > 0:
                    self.add_url(next_url, max_redirect - 1)
            else:
                links = await self.parse_links(response)
                for link in links.difference(self.seen_urls):
                    self.q.put_nowait((link, self.max_redirect))
                self.seen_urls.update(links)
        finally:
            await response.release()

    async def parse_links(self, response):
        """Store the page's JSON payload and return the next page URL.

        Returns a set holding the next page URL when the response is a 200
        with a non-empty JSON body, otherwise an empty set.
        """
        links = set()
        if response.status == 200:
            data = await response.json()
            if data:
                # insert_many is called synchronously, so the event loop
                # waits here until the database write returns.
                insert_many(data)
                self.page += 1
                print('next', self.base_url % (self.page))
                links.add(self.base_url % (self.page))
        return links

    def close(self):
        """Close the HTTP session.

        On aiohttp versions where ``ClientSession.close()`` is a coroutine
        it is scheduled on the loop if the loop is running, otherwise it is
        run to completion here.
        """
        pending = self.session.close()
        if not asyncio.iscoroutine(pending):
            return
        if self.loop.is_running():
            self.loop.create_task(pending)
        else:
            self.loop.run_until_complete(pending)
请求回来的数据比较乱, 还是用 MongoDB 存储比较方便
db.py
import pymongo
# Shared MongoDB handles for the crawler. MongoClient() is created with no
# arguments, so pymongo's default connection settings apply.
client = pymongo.MongoClient()
# Database "unsplash" and its "photo" collection, which insert_many writes to.
db = client['unsplash']
photo = db['photo']
def insert_many(photos):
    """Insert a batch of photo documents into the ``photo`` collection.

    Args:
        photos: list of documents (dicts) to store. An empty or ``None``
            batch is ignored, because pymongo's ``insert_many`` rejects an
            empty document list.
    """
    if not photos:
        return
    photo.insert_many(photos)
数据分析
提取数据
1 | import pymongo |
数据总量
len(photos)
22648
定义常用函数
1 | from collections import defaultdict |
top10 图片大小
1 | width_heights = [str(p['width'])+'_'+str(p['height']) for p in photos] |
[(12761, '2016'),
(5467, '2015'),
(2702, '2017'),
(1498, '2014'),
(220, '2013')]