Scraping the Dr.STONE manga from a certain manga site


Background

I've been watching Dr.STONE recently, but nobody would spoil what comes next for me, so I had to go read the manga myself.

Notes

I first wrote a synchronous version, but it felt slow, so after a bit of searching I tried switching it to async. I had never written an async crawler before, and I probably did it wrong, because there was no noticeable speedup. I'm not going to dig into it any further; back to grinding for the postgraduate entrance exam.
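
In hindsight, the likely reason there was no speedup is that the coroutines below still call the blocking requests.get, so they effectively run one after another. A minimal sketch of what a genuinely concurrent download could look like, assuming aiohttp is installed (the URLs and file names here are placeholders, not the real site's, and this is not what the scripts below actually do):

import asyncio
import aiohttp

async def fetch_image(session: aiohttp.ClientSession, url: str, filename: str) -> None:
    # the await points let other downloads run while this one waits on the network
    async with session.get(url) as resp:
        data = await resp.read()
    with open(filename, 'wb') as f:
        f.write(data)

async def fetch_chapter(urls: list[str]) -> None:
    async with aiohttp.ClientSession() as session:
        tasks = [fetch_image(session, url, f'page{i:02d}.jpg') for i, url in enumerate(urls, 1)]
        await asyncio.gather(*tasks)

# usage: asyncio.run(fetch_chapter(['https://example.com/01.jpg', 'https://example.com/02.jpg']))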

GitHub: Mess-Mess/spider_DrSTONE at master · Wldcmzy/Mess-Mess (github.com)

Version 3.0

The earlier rip wasn't great: the translation was cobbled together from several groups, mixed simplified and traditional Chinese, and some chapters were misaligned. So I redid download_1.0and2.0.py against a new source and merged the functionality of pic_to_url.py into it, producing download_3.0.py. After the source switch the chapter misalignment is gone, but it's still a mix of simplified and traditional Chinese (wonderful).

I then noticed that with the new source a few chapters have images that 404 on the site's side; those chapters can be patched with the output of the 1.0/2.0 versions. 404check.py detects which chapters are broken: it checks whether a chapter directory contains images with identical MD5 hashes, which can occasionally misfire (for example when the site really does serve the same image twice). An idea I didn't implement: since the 404 responses are all the same image (they really are), you could instead check for that one specific hash.
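
A sketch of that specific-hash check, for reference (PLACEHOLDER_MD5 below is made up; you would fill in the MD5 of the site's actual 404 placeholder image):

import os
import hashlib

# hypothetical value: replace with the real MD5 of the site's 404 placeholder image
PLACEHOLDER_MD5 = '0123456789abcdef0123456789abcdef'

def broken_captures(path: str = 'out/') -> list[str]:
    '''Return chapter folders that contain at least one 404 placeholder image.'''
    broken = []
    for folder in os.listdir(path):
        for page in os.listdir(os.path.join(path, folder)):
            with open(os.path.join(path, folder, page), 'rb') as f:
                if hashlib.md5(f.read()).hexdigest() == PLACEHOLDER_MD5:
                    broken.append(folder)
                    break
    return broken

if __name__ == '__main__':
    print(broken_captures())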

Code:

import requests
import re
import bs4
import time
import os
import typing

class MyEasylogger:
    def __init__(self, path: str = './', filename = 'DownloadLOG.log') -> None:
        self.path = path
        self.filename = filename
        self.makelog('MyEasylogger类被实例化', 'MyEasylogger', False)

    def makelog(self, data: str, tag: str = 'default', ifprint: bool = True) -> None:
        if ifprint:
            print(data)
        ifexist = os.path.exists(self.path + self.filename)
        with open(self.path + self.filename, 'a+') as f:
            if not ifexist:
                f.write(f'==> {str(time.localtime()[ : 6])} | [MyEasylogger] 日志建立\n')
            f.write(f'==> {str(time.localtime()[ : 6])} | [{tag}] {data}\n')

    def log(self, data: str, ifprint = True):
        self.makelog(data, 'log', ifprint)

    def debug(self, data: str, ifprint = True):
        self.makelog(data, 'debug', ifprint)

    def error(self, data: str, ifprint = True):
        self.makelog(data, 'error', ifprint)

logger = MyEasylogger()

def try_except_ensure(func):
    '''Decorator: run func and log any exception instead of letting it propagate.'''
    def _(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except Exception as e:
            logger.error(str(type(e)) + '|' + str(e))
    return _

class Downloader:
    DOMIN = 'http://rimanb.com/'

    HTML = '''
<!DOCTYPE html>
<html>
<head>
    <title>{capture}</title>
    <meta charset="utf-8">
    <style>
        body{{
            background-color: black;
        }}
        div#pages{{
            text-align: center;
        }}
        p#p{{
            font-size: 35px;
            text-align: center;
            color: #66ffff;
        }}
        span#capture_number{{
            color: #ff66ff;
        }}
    </style>
</head>
<body>
{args}
</body>
</html>
'''
    IMAGE = ' <div id="pages"><img onload="if(this.width >= document.documentElement.clientWidth){{this.width = document.documentElement.clientWidth}}" align="middle" src="{page}"></img></div>\n'
    P = ' <p id="p"><strong>{caricature_name} 第 <span id="capture_number">{p}</span> 话</strong></p>\n'
    SEP = '<br><br>\n <hr style="FILTER:alpha(opacity=100,finishopacity=0,style=3)" width="95%"color=#00FF7F SIZE=5>'

    def __init__(
        self,
        catalog_url: str = 'http://rimanb.com/book/2407',
        start_end: tuple[int, int] = (39, 232),
        image_format: str = '.jpg',
        outpath_root: str = './out/',
        htmlpath: str = './html/',
        caricature_name: str = 'Mr.STONE石纪元',
        page_number_length: int = 2,
        capture_number_length: int = 3,
        headers: dict = {
            'User-Agent': r'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.81 Safari/537.36 Edg/104.0.1293.47',
        },
    ) -> None:
        '''
        catalog_url:
            URL of the manga's chapter index page
        start_end:
            first and last chapter to download
        image_format:
            image file extension (including the dot)
        outpath_root:
            output root path, ending with /
        htmlpath:
            output path for the HTML files that stitch the images together for reading
        caricature_name:
            manga title, used to name HTML elements
        page_number_length:
            string length used when numbering page files
        capture_number_length:
            string length used when numbering chapter folders
        headers:
            request headers
        '''
        self.start, self.end = start_end
        self.catalog_url = catalog_url
        self.image_format = image_format
        self.outpath_root = outpath_root
        self.htmlpath = htmlpath
        self.caricature_name = caricature_name
        self.page_number_length = page_number_length
        self.capture_number_length = capture_number_length
        self.headers = headers
        if not os.path.exists(self.outpath_root):
            os.mkdir(self.outpath_root)
            logger.debug(f'输出目录{self.outpath_root}不存在, 已经成功建立...')

    def prefix_zero(self, ss: typing.Union[str, int], leng: int) -> str:
        '''
        Left-pad ss with zeros until it reaches length leng.
        ss:
            initial string
        leng:
            target length
        '''
        ss = str(ss)
        assert len(ss) <= leng
        if len(ss) < leng:
            ss = '0' * (leng - len(ss)) + ss
        return ss

    def format_page_number(self, page: str) -> str:
        '''
        Zero-pad a page number to the configured length.
        page:
            page number string
        '''
        return self.prefix_zero(page, self.page_number_length)

    def format_capture_number(self, cap: str) -> str:
        '''
        Zero-pad a chapter number to the configured length.
        cap:
            chapter number string
        '''
        return self.prefix_zero(cap, self.capture_number_length)

    def get_url_dict(self) -> dict[int, str]:
        '''
        Get the URL of every chapter.
        return: dic
            mapping from chapter number to URL for all chapters
            {chapter number : url}
        '''
        res = requests.get(self.catalog_url, headers = self.headers)
        soup = bs4.BeautifulSoup(res.text)
        lst = soup.findAll('span', class_ = 'works-chapter-item')
        dic = {}
        for each in lst:
            id = re.search('第([0-9]+)话', str(each.a['title'])).group(1)
            href = each.a['href']
            dic[int(id)] = Downloader.DOMIN + href
        return dic

    def parse_capture(self, url: str) -> list[tuple[int, str]]:
        '''
        Get the links of all images of one chapter.
        url:
            URL of the chapter page
        return: lst
            links of all images in that chapter
            [(image index, url), ]
        '''
        res = requests.get(url, headers = self.headers)
        soup = bs4.BeautifulSoup(res.text)
        imgs = soup.find('ul', id = 'comicContain', class_ = 'comic-contain').findAll('li')
        lst = []
        for i, each in enumerate(imgs):
            lst.append((i + 1, Downloader.DOMIN + each.img['src']))
        return lst

    def download_one_picture(self, url: str, filepath: str, filename: str) -> None:
        '''
        Download one image.
        url:
            image URL
        filepath:
            directory path, ending with "/"
        filename:
            file name (without extension)
        '''
        res = requests.get(url, headers = self.headers)
        with open(f'{filepath}{filename}{self.image_format}', 'wb') as f:
            f.write(res.content)
        logger.log(f'下载图片并保存为{filepath}{filename}{self.image_format}完成...')

    @try_except_ensure
    def run(self) -> None:
        '''
        Download the images.
        catalog URL:         self.catalog_url
        first chapter:       self.start
        last chapter:        self.end
        images are saved as: self.outpath_root/capture<chapter>/page<page><self.image_format>
        '''
        logger.log(f'开始下载,漫画目录链接为:{self.catalog_url}')
        logger.log('正在获取章节目录...')
        self.url_dict = self.get_url_dict()
        logger.log(f'章节目录获取完成,共{len(self.url_dict)}个项目...')
        for capture in range(self.start, self.end + 1):
            logger.log(f'!!!章节进度: {capture}({self.start}-{self.end}) ({capture - self.start + 1}/{self.end - self.start + 1})')
            outpath = f'{self.outpath_root}capture{self.format_capture_number(capture)}/'
            if not os.path.exists(outpath):
                logger.debug(f'创建目录{outpath}')
                os.mkdir(outpath)
            if capture not in self.url_dict:
                logger.error(f'在章节目录中没有第{capture}章的记录(KeyError:{capture})')
                continue
            image_list = self.parse_capture(self.url_dict[capture])
            for pid, purl in image_list:
                logger.log(f'!图片进度: ({pid}/{len(image_list)})')
                self.download_one_picture(purl, outpath, 'page' + self.format_page_number(pid))
        logger.log('下载结束...')

    def combine_one_capture_images_to_html(self, folder_name: str) -> str:
        '''
        Wrap all images of one chapter into HTML image blocks.
        folder_name:
            folder name of the chapter
        return: html
            HTML fragment containing every page* image in folder_name
        '''
        pages = [each for each in os.listdir(self.outpath_root + folder_name) if each[ : 4] == 'page']
        pages = sorted(pages)
        html = ''
        image_src = '../' + self.outpath_root + folder_name + '/'
        for each in pages:
            html += Downloader.IMAGE.format(page = image_src + each)
        return html

    def create_one_html(self, name: str, html: str, collection: bool = False) -> None:
        '''
        Generate the HTML file of one chapter.
        name:
            name of the HTML file to generate
        html:
            HTML fragment of this chapter's images
        collection:
            whether this call generates the all-in-one collection file
        '''
        index = f'第{int(name[7 : ])}话' if not collection else '总集篇'
        with open(self.htmlpath + self.caricature_name + '_' + name + '.html', 'w', encoding='utf-8') as f:
            f.write(Downloader.HTML.format(capture = f'{self.caricature_name} {index}', args = html))

    def create_htmls(self, collection: bool = True) -> None:
        '''
        Generate HTML files for all chapters.
        collection:
            whether to also generate the all-in-one collection file
        '''
        if not os.path.exists(self.htmlpath):
            os.mkdir(self.htmlpath)
            logger.debug(f'{self.htmlpath}不存在, 建立成功')
        logger.log(f'开始为漫画创建html文件, {"" if collection else "不"}包括创建总集篇')
        capture_folders = [each for each in os.listdir(self.outpath_root) if each[ : 7] == 'capture']
        totalhtml = ''
        for each in capture_folders:
            html = Downloader.SEP
            html += Downloader.P.format(p = int(each[7 : ]), caricature_name = self.caricature_name)
            html += self.combine_one_capture_images_to_html(each)
            self.create_one_html(each, html)
            logger.log(each + 'html创建完成...')
            if collection:
                totalhtml += html
        if collection:
            self.create_one_html('ALL CAPTURES', totalhtml, True)
            logger.log('总集篇html创建完成...')


if __name__ == '__main__':
    d = Downloader()
    d.run()
    d.create_htmls()

404check

import os
import hashlib

path = 'out/'
folders = os.listdir(path)
lst404 = []
for folder in folders:
    pages = os.listdir(path + folder + '/')
    dic = {}
    for page in pages:
        with open(path + folder + '/' + page, 'rb') as f:
            hasher = hashlib.md5(f.read())
            hash = hasher.hexdigest()
        if hash not in dic:
            dic[hash] = 1
        else:
            # duplicate hash inside one chapter folder: very likely the 404 placeholder
            lst404.append(folder)
            print(folder)
            break
with open('404captures.txt', 'w') as f:
    for each in lst404:
        f.write(each + '\n')

Other historical versions

Code (buggy version)

The bug: chapter pages were fetched by assuming a constant offset between adjacent chapters' page IDs, which ignores the occasional out-of-order chapter in between. From the first out-of-order chapter onward, every later chapter was fetched incorrectly, even where the ordering later recovered.

The fix (see the corrected version below): first fetch the chapter index from the parent page, build a mapping from chapter number to URL, and look each chapter up in that table.

import requests
from bs4 import BeautifulSoup
import re
from time import localtime
from os import mkdir
from os.path import exists
import asyncio

class Mylogger:
    def __init__(self) -> None:
        pass

    # def inactive(self) -> None:
    #     self.fp.close()

    def log(self, data: str, ifprint = True) -> None:
        if ifprint:
            print(data)

        # may throw errors after the switch to async; file logging can be disabled here
        #return None
        with open('DownloadLOG.log', 'a+') as f:
            f.write(f'==> {str(localtime()[ : 6])} | {data}\n')

class Downloader:

    DrSTONE_ADDR = 25638 - 1

    # page-number width
    PAGE_NUMBER_LENGTH = 2

    # chapter-number width
    CAPTURE_NUMBER_LENGTH = 3

    # first and last chapter to download
    CAPTURE_START, CAPTURE_END = 1, 233

    # base chapter URL template
    BASE_URL = r'https://omyschool.com/article_detail/152/{addr}/Dr.STONE%20%E7%9F%B3%E7%BA%AA%E5%85%83/{cap}%E8%A9%B1/'

    # request headers
    MY_HEADERS = {
        'User-Agent': r'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.81 Safari/537.36 Edg/104.0.1293.47',
    }

    # output path
    PATH = './out/'

    def __init__(self) -> None:
        self.logger = Mylogger()
        if not exists(Downloader.PATH):
            mkdir(Downloader.PATH)

    def prefix_zero(self, ss: str, leng: int) -> str:
        '''Left-pad ss with zeros until it reaches length leng.'''
        ss = str(ss)
        if len(ss) < leng:
            ss = '0' * (leng - len(ss)) + ss
        return ss

    def format_page_number(self, page: str) -> str:
        '''
        Zero-pad a page number to the configured length.
        page: page number string
        '''
        return self.prefix_zero(page, Downloader.PAGE_NUMBER_LENGTH)

    def format_capture_number(self, cap: str) -> str:
        '''
        Zero-pad a chapter number to the configured length.
        cap: chapter number string
        '''
        return self.prefix_zero(cap, Downloader.CAPTURE_NUMBER_LENGTH)

    def form_capture_url(self, capture: int) -> str:
        '''
        Return the page URL of chapter `capture`.
        capture: chapter number
        '''
        str_capture = str(capture)
        str_addr = str(Downloader.DrSTONE_ADDR + capture)
        if capture < 10: str_capture = '0' + str_capture
        return Downloader.BASE_URL.format(cap = str_capture, addr = str_addr)

    async def download_one_picture(self, url: str, filepath: str, filename: str) -> None:
        '''
        Download one image.
        url: image URL
        filepath: directory path, ending with "/"
        filename: file name (without extension)
        '''
        try:
            fileformat = url.split('.')[-1]
            res = requests.get(url, headers=Downloader.MY_HEADERS)
            with open(f'{filepath}{filename}.{fileformat}', 'wb') as f:
                f.write(res.content)
            print(f'{filepath}{filename}.{fileformat}完成...')
        except Exception as e:
            self.logger.log(f'{type(e)} | {str(e)} | 下载图片{filepath}{filename}时出错。')

    def parse_one_capture(self, capture_number: int) -> list[tuple[str, str]]:
        '''
        Parse chapter `capture_number` and return the URL and index of every image in it.
        capture_number: chapter number
        '''
        try:
            url = self.form_capture_url(capture_number)
            print(url)
            res = requests.get(url, headers=Downloader.MY_HEADERS)
            soup = BeautifulSoup(res.text.replace('amp-img', 'amp_img'))
            data_list = soup.find_all('div', {'data-id': True})
            image_list = []
            for each in data_list:
                picture_url = each.amp_img['src']
                picture_id = re.search('第([0-9]+)页', each.amp_img['alt']).group(1)
                image_list.append((picture_url, picture_id))
            return image_list
        except Exception as e:
            self.logger.log(f'{type(e)} | {str(e)} | 解析第{capture_number}章链接时出错,可能部分或全部图片未正确下载。')
            return []

    def run(self):
        '''Main entry point.'''
        #try:
        print('开始下载...')
        for capture in range(Downloader.CAPTURE_START, Downloader.CAPTURE_END + 1):
            print(f'章进度:{capture - Downloader.CAPTURE_START + 1} / {Downloader.CAPTURE_END + 1 - Downloader.CAPTURE_START}')
            path = f'{Downloader.PATH}capture{self.format_capture_number(capture)}/'
            if not exists(path):
                mkdir(path)
            image_list = self.parse_one_capture(capture)
            if not image_list: continue
            tasks = []
            for picture_url, picture_id in image_list:
                tasks.append(asyncio.ensure_future(
                    self.download_one_picture(
                        picture_url,
                        path,
                        'page' + self.format_page_number(picture_id)
                    )
                ))
            loop = asyncio.get_event_loop()
            loop.run_until_complete(asyncio.wait(tasks))
        print('下载结束...')
        # except Exception as e:
        #     self.logger.log(f'{type(e)} | {str(e)} | run方法遇到未知错误')

if __name__ == '__main__':
    d = Downloader()
    d.run()

Code (corrected version)

import requests
from bs4 import BeautifulSoup
import re
from time import localtime
from os import mkdir
from os.path import exists
import asyncio

class Mylogger:
    def __init__(self) -> None:
        pass

    # def inactive(self) -> None:
    #     self.fp.close()

    def log(self, data: str, ifprint = True) -> None:
        if ifprint:
            print(data)

        # may throw errors after the switch to async; file logging can be disabled here
        #return None
        with open('DownloadLOG.log', 'a+') as f:
            f.write(f'==> {str(localtime()[ : 6])} | {data}\n')

class Downloader:

    #DrSTONE_ADDR = 25638 - 1
    DrSTONE_ADDR = 239049 - 1

    # page-number width
    PAGE_NUMBER_LENGTH = 2

    # chapter-number width
    CAPTURE_NUMBER_LENGTH = 3

    # first and last chapter to download
    CAPTURE_START, CAPTURE_END = 136, 233

    # base chapter URL template
    BASE_URL = r'https://omyschool.com/article_detail/152/{addr}/Dr.STONE%20%E7%9F%B3%E7%BA%AA%E5%85%83/{cap}%E8%A9%B1/'

    # request headers
    MY_HEADERS = {
        'User-Agent': r'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.81 Safari/537.36 Edg/104.0.1293.47',
    }

    # output path
    PATH = './out/'

    def __init__(self) -> None:
        self.logger = Mylogger()
        if not exists(Downloader.PATH):
            mkdir(Downloader.PATH)

    def prefix_zero(self, ss: str, leng: int) -> str:
        '''Left-pad ss with zeros until it reaches length leng.'''
        ss = str(ss)
        if len(ss) < leng:
            ss = '0' * (leng - len(ss)) + ss
        return ss

    def format_page_number(self, page: str) -> str:
        '''
        Zero-pad a page number to the configured length.
        page: page number string
        '''
        return self.prefix_zero(page, Downloader.PAGE_NUMBER_LENGTH)

    def format_capture_number(self, cap: str) -> str:
        '''
        Zero-pad a chapter number to the configured length.
        cap: chapter number string
        '''
        return self.prefix_zero(cap, Downloader.CAPTURE_NUMBER_LENGTH)

    def form_capture_url(self, capture: int) -> str:
        '''
        Return the page URL of chapter `capture`.
        capture: chapter number
        '''
        str_capture = str(capture)
        str_addr = str(Downloader.DrSTONE_ADDR + capture)
        if capture < 10: str_capture = '0' + str_capture
        return Downloader.BASE_URL.format(cap = str_capture, addr = str_addr)

    async def download_one_picture(self, url: str, filepath: str, filename: str) -> None:
        '''
        Download one image.
        url: image URL
        filepath: directory path, ending with "/"
        filename: file name (without extension)
        '''
        try:
            #print(url)
            fileformat = url.split('.')[-1]
            res = requests.get(url, headers=Downloader.MY_HEADERS)
            with open(f'{filepath}{filename}.{fileformat}', 'wb') as f:
                f.write(res.content)
            print(f'{filepath}{filename}.{fileformat}完成...')
        except Exception as e:
            self.logger.log(f'{type(e)} | {str(e)} | 下载图片{filepath}{filename}时出错。')

    def parse_one_capture(self, capture_number: int) -> list[tuple[str, str]]:
        '''
        Parse chapter `capture_number` and return the URL and index of every image in it.
        capture_number: chapter number
        '''
        try:
            #url = self.form_capture_url(capture_number)
            url = self.NEW__form_capture_url(capture_number)
            print(url)
            res = requests.get(url, headers=Downloader.MY_HEADERS)
            soup = BeautifulSoup(res.text.replace('amp-img', 'amp_img'))
            data_list = soup.find_all('div', {'data-id': True})
            image_list = []
            for each in data_list:
                picture_url = each.amp_img['src']
                picture_id = re.search('第([0-9]+)页', each.amp_img['alt']).group(1)
                image_list.append((picture_url, picture_id))
            return image_list
        except Exception as e:
            self.logger.log(f'{type(e)} | {str(e)} | 解析第{capture_number}章链接时出错,可能部分或全部图片未正确下载。')
            return []

    def run(self):
        '''Main entry point.'''

        self.link_dict = self.NEW_get_random_capture_url()

        #try:
        print('开始下载...')
        for capture in range(Downloader.CAPTURE_START, Downloader.CAPTURE_END + 1):
            print(f'章进度:{capture - Downloader.CAPTURE_START + 1} / {Downloader.CAPTURE_END + 1 - Downloader.CAPTURE_START}')
            path = f'{Downloader.PATH}capture{self.format_capture_number(capture)}/'
            if not exists(path):
                mkdir(path)
            image_list = self.parse_one_capture(capture)
            if not image_list: continue
            tasks = []
            for picture_url, picture_id in image_list:
                tasks.append(asyncio.ensure_future(
                    self.download_one_picture(
                        picture_url,
                        path,
                        'page' + self.format_page_number(picture_id)
                    )
                ))
            loop = asyncio.get_event_loop()
            loop.run_until_complete(asyncio.wait(tasks))
        print('下载结束...')
        # except Exception as e:
        #     self.logger.log(f'{type(e)} | {str(e)} | run方法遇到未知错误')

    #def NEW__get_random_capture_url() -> list[tuple[str, str]]:
    def NEW_get_random_capture_url(self) -> dict[int, str]:
        '''Fetch the chapter index and return a chapter-number -> URL mapping for every Dr.STONE chapter.'''
        print('获取章节目录开始...')
        DOMIN = 'https://omyschool.com'
        DrSTONE_url = r'https://omyschool.com/article_list/152/Dr.STONE%20%E7%9F%B3%E7%BA%AA%E5%85%83/'
        res = requests.get(DrSTONE_url, headers=Downloader.MY_HEADERS)
        soup = BeautifulSoup(res.text)
        lst = soup.findAll('div', {"class": "chapter"})
        #ret = []
        ret_d = {}
        for each in lst:
            href = each.a["href"]
            if 'Dr.STONE' in href:
                id = re.search('([0-9]+)[话|話]', str(each)).group(1)
                # ret.append((DOMIN + href, id))
                # return sorted(ret, key = lambda x: int(x[1]))
                ret_d[int(id)] = DOMIN + href
        print('获取章节目录完成...')
        return ret_d

    def NEW__form_capture_url(self, capture: int) -> str:
        '''
        Return the page URL of chapter `capture`, looked up in the chapter table.
        capture: chapter number
        '''
        return self.link_dict[capture]


if __name__ == '__main__':
    d = Downloader()
    d.run()

EXTRA

The text is small, and enlarging images one by one is a pain, so I stitched each chapter into a local web page: you just scroll with the mouse wheel and can view the images at full size directly.

import os

PICTURE_PATH = 'out/'
HTML_PATH = 'html/'

def make_one_capture_to_html(folder_name: str) -> None:
    '''Stitch all page images of one chapter folder into a single scrollable HTML file.'''
    pages = [each for each in os.listdir(PICTURE_PATH + folder_name) if each[ : 4] == 'page']
    pages = sorted(pages)
    html = '''
<!DOCTYPE html>
<html>
<head>
    <title>temp</title>
    <meta charset="utf-8">
    <style>
        div#MrSTONEpages{{
            text-align: center;
        }}
    </style>
</head>
<body>
{images}
</body>
</html>
'''
    img = ' <div id="MrSTONEpages"><img onload="if(this.width >= document.documentElement.clientWidth){{this.width = document.documentElement.clientWidth}}" align="middle" src="{page}"></img></div>\n'
    imgs = ''
    RELATIVE_PATH = '../' + PICTURE_PATH + folder_name + '/'
    for each in pages:
        imgs += img.format(page = RELATIVE_PATH + each)
    with open(HTML_PATH + folder_name + '.html', 'w', encoding='utf-8') as f:
        f.write(html.format(images = imgs))


if __name__ == '__main__':
    # make sure the output directory exists before writing any HTML files
    if not os.path.exists(HTML_PATH):
        os.mkdir(HTML_PATH)
    capture_folders = [each for each in os.listdir(PICTURE_PATH) if each[ : 7] == 'capture']
    for each in capture_folders:
        make_one_capture_to_html(each)