【Study Notes】Asynchronous Crawlers

。。。

Extra step needed when running inside Jupyter

import nest_asyncio
nest_asyncio.apply()
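
For context: Jupyter already runs its own event loop, so calling run_until_complete() there normally raises "This event loop is already running"; nest_asyncio patches the loop so nested use is allowed. A minimal sketch (the demo coroutine is just for illustration):

import asyncio
import nest_asyncio
nest_asyncio.apply()

async def demo():
    await asyncio.sleep(0.1)
    return 'done'

# without nest_asyncio this call would fail inside Jupyter, because the notebook's loop is already running
print(asyncio.get_event_loop().run_until_complete(demo()))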

Simple HTTP request examples

Some common setup first

import aiohttp
import asyncio

headers = {
    'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.102 Safari/537.36 Edg/104.0.1293.63'
}

Simple GET request example

Presumably other HTTP methods can be used by analogy with the requests library (a hedged POST sketch follows the equivalent forms below).

The coroutine text() takes an encoding argument that defaults to None, in which case decoding is handled automatically; the response object is of type aiohttp.ClientResponse, whose full API is in the docs.

async with aiohttp.ClientSession() as session:
    async with session.request(
        'GET',
        'https://wldcmzy.github.io/',
        headers = headers
    ) as res:
        print(await res.text(encoding='utf-8'))

Equivalent form 1

async with aiohttp.ClientSession() as session:
    async with session.get(
        'https://wldcmzy.github.io/',
        headers = headers
    ) as res:
        print(await res.text(encoding='utf-8'))

Equivalent form 2 (without the context managers)

session = aiohttp.ClientSession()
res = await session.request(
    'GET',
    'https://wldcmzy.github.io/',
    headers = headers
)
print(await res.text(encoding='utf-8'))
await session.close()  # ClientSession.close() is a coroutine, so it must be awaited
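
Following up on the guess above: the session does expose requests-style method helpers (post(), put(), delete(), ...). A hedged sketch of a POST, assuming the same headers dict as before; httpbin.org is only a placeholder endpoint, not something from the original notes:

async with aiohttp.ClientSession() as session:
    async with session.post(
        'https://httpbin.org/post',   # placeholder endpoint for illustration
        data = {'key': 'value'},      # form-encoded body; use json= for a JSON body instead
        headers = headers
    ) as res:
        print(res.status)
        print(await res.text())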

The aiohttp.ClientSession constructor itself also accepts parameters, e.g. headers:

# headers passed to the session constructor become defaults for every request,
# so repeating them per request (as below) is redundant but harmless
async with aiohttp.ClientSession(headers = headers) as session:
    async with session.get(
        'https://wldcmzy.github.io/',
        headers = headers
    ) as res:
        print(await res.text(encoding='utf-8'))
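
Besides headers, a few other constructor parameters that seem commonly useful (a sketch, not an exhaustive list; the cookie value is made up for illustration):

timeout = aiohttp.ClientTimeout(total = 30)    # overall per-request timeout in seconds
connector = aiohttp.TCPConnector(limit = 10)   # cap on simultaneous connections
async with aiohttp.ClientSession(
    headers = headers,
    timeout = timeout,
    connector = connector,
    cookies = {'session_id': 'xxx'},           # hypothetical cookie, illustration only
) as session:
    async with session.get('https://wldcmzy.github.io/') as res:
        print(res.status)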

A minimal asynchronous crawler

import aiohttp
import asyncio
import time

urls = [
    'https://wldcmzy.github.io/',
    'https://docs.aiohttp.org/en/stable/index.html',
    'https://github.com/Nearrin/HollowKnight.PureZote',
    'https://www.baidu.com/',
]

async def getHTML(url):
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as res:
            # print the fetch time and the first 100 characters of the page
            print(time.time(), (await res.text())[ : 100], sep = '\n', end = '\n<<<<<<<<<<<<<\n')

loop = asyncio.get_event_loop()
tasks = [asyncio.ensure_future(getHTML(each)) for each in urls]
tasks = asyncio.gather(*tasks)
loop.run_until_complete(tasks)
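
The get_event_loop/ensure_future pattern above is the older asyncio style; on Python 3.7+ the same crawl can be written with asyncio.run and asyncio.gather (a sketch reusing the urls list and getHTML from above):

async def main():
    await asyncio.gather(*(getHTML(each) for each in urls))

# note: asyncio.run() raises if a loop is already running (e.g. bare Jupyter),
# which is why the notes above fall back to nest_asyncio + run_until_complete there
asyncio.run(main())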


Limiting concurrency with asyncio.Semaphore

import aiohttp
import asyncio
import time

URLS = [
    'https://wldcmzy.github.io/',
    'https://docs.aiohttp.org/en/stable/index.html',
    'https://www.baidu.com/',
    'https://blog.csdn.net/mixintu/article/details/102424471',
]

async def getHTML(url):
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as res:
            return url, res.status

async def work(url, semaphore):
    # the semaphore lets only a fixed number of workers run this block at once
    async with semaphore:
        url, sta = await getHTML(url)
        print(url, sta)

# async def main(loop: asyncio.AbstractEventLoop) -> None:
async def main() -> None:
    semaphore = asyncio.Semaphore(2)
    # newer Python expects tasks/futures in asyncio.wait(), so wrap the coroutines
    tasks = [asyncio.ensure_future(work(each, semaphore)) for each in URLS]
    await asyncio.wait(tasks)
    print('ok')

if __name__ == '__main__':
    loop = asyncio.get_event_loop()
    try:
        st = time.time()
        print('wait', st)
        loop.run_until_complete(main())
        print('over, usetime:', time.time() - st , 's')
        st = time.time()
        print('wait 2nd', st)
        loop.run_until_complete(main())
        print('over, usetime:', time.time() - st , 's')
    finally:
        # loop.close()
        print('closed')
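
One thing worth noting in the example above: getHTML opens a brand-new ClientSession for every URL, so no connections are reused. A sketch of the same semaphore idea with one shared session (fetch and main_shared are illustration names, reusing URLS from above):

async def fetch(session, url, semaphore):
    async with semaphore:                 # at most 2 requests in flight at a time
        async with session.get(url) as res:
            return url, res.status

async def main_shared():
    semaphore = asyncio.Semaphore(2)
    async with aiohttp.ClientSession() as session:
        results = await asyncio.gather(*(fetch(session, u, semaphore) for u in URLS))
    for url, status in results:
        print(url, status)

# loop.run_until_complete(main_shared())  # or asyncio.run(main_shared()) on Python 3.7+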


Example

A comic crawler; an async crawler whose speed seems limited by its own design: chapters, and the HTML pages within each chapter, are awaited one after another, so only the images on a single HTML page are downloaded concurrently (a sketch of a more concurrent chapter loop is at the end of this post).

github: Mess-Mess/spider_didiaomh.com_async.py at master · Wldcmzy/Mess-Mess (github.com)

from bs4 import BeautifulSoup
import re
from Wrappers import logger, remove_invalid_element_in_windows_path, try_except_ensure
from Wrappers.Affix import Affix_OnlyZeroExample
from Wrappers.HTMLwrapper import Image_HTMLwrapper_Lv2
import os
import aiohttp
import asyncio

class SpiderX:
    DOMIN: str = 'https://www.didiaomh.com/'

    def __init__(
        self,
        catalog_url: str,
        outpath_name: str = 'out',
        capture_format: str = 'capture{arg}',
        image_format: str = 'image{arg}.jpg',
        max_capture_number_length: int = 3,
        max_page_number_length: int = 3,
        work_span: tuple[int] = (0, 0),
        concurrency: int = 5,
        myproxy = None,
        # headers: dict = {},
    ) -> None:
        '''
        catalog_url:
            URL of the catalog (table of contents) page
        outpath_name:
            name of the output directory
        capture_format:
            filename format for a chapter
        image_format:
            filename format for an image
        max_capture_number_length:
            maximum width of the zero-padded chapter number
        max_page_number_length:
            maximum width of the zero-padded image number
        work_span: tuple[int]:
            range of chapters to crawl as (low, high);
            the default (0, 0) crawls everything
        concurrency:
            number of tasks allowed to run at the same time
        myproxy:
            proxy server information
        '''
        self.catalog_url = catalog_url if catalog_url[ : len(self.DOMIN)] == self.DOMIN else self.DOMIN + catalog_url
        self.outpath_name = outpath_name
        self.check_path(self.outpath_name)
        self.capture_format = capture_format
        self.image_format = image_format
        self.max_capture_number_length = max_capture_number_length
        self.max_page_number_length = max_page_number_length
        self.work_span = work_span
        self.myproxy = myproxy

        self.page_counter = 0
        self.affix = Affix_OnlyZeroExample()

        self.semaphore = asyncio.Semaphore(concurrency)
        # a single shared session is created here and reused for every request
        self.session = aiohttp.ClientSession()

    def toreload_format_page_name(self, index: int):
        return self.image_format.format(arg = self.affix.add_prefix(str(index), self.max_page_number_length))

    def toreload_format_capture_name(self, index: int, capture_title: str):
        return self.capture_format.format(arg = self.affix.add_prefix(str(index), self.max_page_number_length) + capture_title)

    def check_path(self, path: str) -> None:
        '''Check that a directory exists; create it if it does not.'''
        if not os.path.exists(path):
            os.mkdir(path)
            logger.debug(f'Creating directory: {path}...')

    async def get_html(self, url: str) -> str:
        '''Fetch the HTML of a page.'''
        async with self.semaphore:
            async with self.session.get(url, proxy = self.myproxy) as response:
                return await response.text()

    async def get_catalog(self) -> dict[int, tuple[str, str]]:
        '''
        Build the comic's catalog (chapter -> URL mapping).
        return:
            { index : (url, capture_title), }
        '''
        html = await self.get_html(self.catalog_url)
        soup = BeautifulSoup(html, 'html.parser')  # name the parser explicitly to avoid bs4's warning
        li_tag_list: list[BeautifulSoup] = soup.find('ul', class_ = 'chapter-list clearfix').findAll('li')
        catalog = {}
        for i, each in enumerate(li_tag_list):
            catalog[i + 1] = self.DOMIN + each.a['href'], each.a.string
        return catalog

    async def save_image(self, pathname: str, data: bytes) -> None:
        '''Save a single image to disk.'''
        with open(pathname, 'wb') as f:
            f.write(data)

    async def download_one_image(self, url: str, pathname: str) -> None:
        '''Download a single image.'''
        async with self.session.get(url) as res:
            await self.save_image(pathname, await res.read())

    async def download_one_capture_perpage(self, soup: BeautifulSoup, foldername: str) -> None:
        '''Download all images on one HTML page of a chapter.'''
        li_tag_list: list[BeautifulSoup] = soup.find('div', class_ = 'comiclist').findAll('div', class_ = 'comicpage')
        tasks = []
        for each in li_tag_list:
            url: str = each.img['src']
            self.page_counter += 1
            filename = remove_invalid_element_in_windows_path(self.toreload_format_page_name(self.page_counter))
            tasks.append(asyncio.ensure_future(self.download_one_image(url, f'{self.outpath_name}/{foldername}/{filename}')))
        # the images on one HTML page are downloaded concurrently
        await asyncio.wait(tasks)


    async def download_one_capture(self, url: str, foldername: str) -> None:
        '''Download all images of one chapter.'''
        self.check_path(f'{self.outpath_name}/{foldername}')
        self.page_counter = 0
        html = await self.get_html(url)
        soup = BeautifulSoup(html, 'html.parser')
        # the "第x/y页" selector text tells us how many HTML pages the chapter has
        total_pages = int(re.search('第[0-9]+/([0-9]+)页', soup.find('select', class_ = 'selectpage').option.string).group(1))
        logger.log('html page 1')
        await self.download_one_capture_perpage(soup, foldername)
        for i in range(2, total_pages + 1):
            logger.log(f'html page {i}')
            html = await self.get_html(url.replace('.html', f'?page={i}'))
            soup = BeautifulSoup(html, 'html.parser')
            await self.download_one_capture_perpage(soup, foldername)

    @try_except_ensure
    async def download_all_caputres(self) -> None:
        '''Download every image of the comic.'''
        catalog = await self.get_catalog()
        work_span_low, work_span_high = self.work_span
        flag = work_span_high > work_span_low
        logger.debug(f'work span flag:{flag}, low:{work_span_low}, high:{work_span_high}')
        for key, value in catalog.items():
            if flag:
                if key > work_span_high or key < work_span_low:
                    continue
            logger.log(f'Chapter progress: {key}/{len(catalog)}')
            url, capture_title = value
            capture_name = remove_invalid_element_in_windows_path(self.toreload_format_capture_name(key, capture_title))
            await self.download_one_capture(url, capture_name)


if __name__ == '__main__':
    loop = asyncio.get_event_loop()

    x = SpiderX(
        'https://www.didiaomh.com/manhua/6500.html',
        outpath_name = '偷星九月天IMAGES',
    )

    try:
        loop.run_until_complete(x.download_all_caputres())
    except Exception as e:
        logger.error(f'{type(e)}|{str(e)}')
    finally:
        loop.run_until_complete(x.session.close())  # close the shared ClientSession before closing the loop
        loop.close()

#===========================================================================

class TX9MOONSKY(Image_HTMLwrapper_Lv2):
    def __init__(
        self,
        root: str,
        html_path: str,
        html_vital_element: str,
        html_collection_name,
        prefix_LvRoot: str,
        prefix_Lv2: str,
    ) -> None:
        super().__init__(
            root,
            html_path,
            html_vital_element,
            html_collection_name,
            prefix_LvRoot,
            prefix_Lv2
        )

    def toreload_parse_diff_element_title(self, folder_name: str) -> str:
        return super().toreload_parse_diff_element_title(folder_name)

    def toreload_parse_diff_element_P(self, folder_name: str) -> str:
        return super().toreload_parse_diff_element_P(folder_name)[3 : ]


y = TX9MOONSKY(
    root = '偷星九月天IMAGES',
    html_path = '偷星九月天HTMLS',
    html_vital_element = '偷星九月天 {arg}',
    html_collection_name = ' 偷星九月天 全集 ',
    prefix_LvRoot = 'capture',
    prefix_Lv2 = 'image'
)
y.create_htmls()
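
As noted above, download_all_caputres awaits each chapter (and each HTML page within a chapter) in sequence, so only the images on one HTML page ever download in parallel. A hedged sketch of how the chapter loop could be gathered instead; this is not a drop-in change, because self.page_counter is shared mutable state in the original class and would first have to become per-chapter:

# illustrative only, not part of the original repo
async def download_all_captures_concurrent(self) -> None:
    catalog = await self.get_catalog()
    tasks = []
    for key, (url, capture_title) in catalog.items():
        capture_name = remove_invalid_element_in_windows_path(
            self.toreload_format_capture_name(key, capture_title))
        tasks.append(self.download_one_capture(url, capture_name))
    # page fetches are still bounded by self.semaphore inside get_html
    await asyncio.gather(*tasks)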