【Study Notes】Asynchronous Crawlers

。。。

Extra step needed when running inside Jupyter

import nest_asyncio
nest_asyncio.apply()
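
For context: Jupyter already runs its own event loop, so calling run_until_complete() there normally raises "This event loop is already running"; nest_asyncio patches the loop so nested use is allowed. A minimal sketch (the demo coroutine is just for illustration):

import asyncio
import nest_asyncio
nest_asyncio.apply()

async def demo():
    await asyncio.sleep(0.1)
    return 'done'

# without nest_asyncio this call would fail inside Jupyter, because the notebook's loop is already running
print(asyncio.get_event_loop().run_until_complete(demo()))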

Simple HTTP request examples

Some common setup first

import aiohttp
import asyncio

headers = {
    'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.102 Safari/537.36 Edg/104.0.1293.63'
}

Simple GET request example

Presumably other HTTP methods can be used by analogy with the requests library (a hedged POST sketch follows the equivalent forms below).

The coroutine text() takes an encoding argument that defaults to None, in which case decoding is handled automatically; the response object is of type aiohttp.ClientResponse, whose full API is in the docs.

async with aiohttp.ClientSession() as session:
    async with session.request(
        'GET',
        'https://wldcmzy.github.io/',
        headers = headers
    ) as res:
        print(await res.text(encoding='utf-8'))

Equivalent form 1

async with aiohttp.ClientSession() as session:
    async with session.get(
        'https://wldcmzy.github.io/',
        headers = headers
    ) as res:
        print(await res.text(encoding='utf-8'))

Equivalent form 2 (without the context managers)

session = aiohttp.ClientSession()
res = await session.request(
    'GET',
    'https://wldcmzy.github.io/',
    headers = headers
)
print(await res.text(encoding='utf-8'))
await session.close()  # ClientSession.close() is a coroutine, so it must be awaited
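
Following up on the guess above: the session does expose requests-style method helpers (post(), put(), delete(), ...). A hedged sketch of a POST, assuming the same headers dict as before; httpbin.org is only a placeholder endpoint, not something from the original notes:

async with aiohttp.ClientSession() as session:
    async with session.post(
        'https://httpbin.org/post',   # placeholder endpoint for illustration
        data = {'key': 'value'},      # form-encoded body; use json= for a JSON body instead
        headers = headers
    ) as res:
        print(res.status)
        print(await res.text())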

The aiohttp.ClientSession constructor itself also accepts parameters, e.g. headers:

# headers passed to the session constructor become defaults for every request,
# so repeating them per request (as below) is redundant but harmless
async with aiohttp.ClientSession(headers = headers) as session:
    async with session.get(
        'https://wldcmzy.github.io/',
        headers = headers
    ) as res:
        print(await res.text(encoding='utf-8'))
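
Besides headers, a few other constructor parameters that seem commonly useful (a sketch, not an exhaustive list; the cookie value is made up for illustration):

timeout = aiohttp.ClientTimeout(total = 30)    # overall per-request timeout in seconds
connector = aiohttp.TCPConnector(limit = 10)   # cap on simultaneous connections
async with aiohttp.ClientSession(
    headers = headers,
    timeout = timeout,
    connector = connector,
    cookies = {'session_id': 'xxx'},           # hypothetical cookie, illustration only
) as session:
    async with session.get('https://wldcmzy.github.io/') as res:
        print(res.status)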

A minimal asynchronous crawler

import aiohttp
import asyncio
import time

urls = [
    'https://wldcmzy.github.io/',
    'https://docs.aiohttp.org/en/stable/index.html',
    'https://github.com/Nearrin/HollowKnight.PureZote',
    'https://www.baidu.com/',
]

async def getHTML(url):
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as res:
            # print the fetch time and the first 100 characters of the page
            print(time.time(), (await res.text())[ : 100], sep = '\n', end = '\n<<<<<<<<<<<<<\n')

loop = asyncio.get_event_loop()
tasks = [asyncio.ensure_future(getHTML(each)) for each in urls]
tasks = asyncio.gather(*tasks)
loop.run_until_complete(tasks)
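
The get_event_loop/ensure_future pattern above is the older asyncio style; on Python 3.7+ the same crawl can be written with asyncio.run and asyncio.gather (a sketch reusing the urls list and getHTML from above):

async def main():
    await asyncio.gather(*(getHTML(each) for each in urls))

# note: asyncio.run() raises if a loop is already running (e.g. bare Jupyter),
# which is why the notes above fall back to nest_asyncio + run_until_complete there
asyncio.run(main())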


Limiting concurrency with asyncio.Semaphore

import aiohttp
import asyncio
import time

URLS = [
    'https://wldcmzy.github.io/',
    'https://docs.aiohttp.org/en/stable/index.html',
    'https://www.baidu.com/',
    'https://blog.csdn.net/mixintu/article/details/102424471',
]

async def getHTML(url):
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as res:
            return url, res.status

async def work(url, semaphore):
    # the semaphore lets only a fixed number of workers run this block at once
    async with semaphore:
        url, sta = await getHTML(url)
        print(url, sta)

# async def main(loop: asyncio.AbstractEventLoop) -> None:
async def main() -> None:
    semaphore = asyncio.Semaphore(2)
    # newer Python expects tasks/futures in asyncio.wait(), so wrap the coroutines
    tasks = [asyncio.ensure_future(work(each, semaphore)) for each in URLS]
    await asyncio.wait(tasks)
    print('ok')

if __name__ == '__main__':
    loop = asyncio.get_event_loop()
    try:
        st = time.time()
        print('wait', st)
        loop.run_until_complete(main())
        print('over, usetime:', time.time() - st , 's')
        st = time.time()
        print('wait 2nd', st)
        loop.run_until_complete(main())
        print('over, usetime:', time.time() - st , 's')
    finally:
        # loop.close()
        print('closed')
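
One thing worth noting in the example above: getHTML opens a brand-new ClientSession for every URL, so no connections are reused. A sketch of the same semaphore idea with one shared session (fetch and main_shared are illustration names, reusing URLS from above):

async def fetch(session, url, semaphore):
    async with semaphore:                 # at most 2 requests in flight at a time
        async with session.get(url) as res:
            return url, res.status

async def main_shared():
    semaphore = asyncio.Semaphore(2)
    async with aiohttp.ClientSession() as session:
        results = await asyncio.gather(*(fetch(session, u, semaphore) for u in URLS))
    for url, status in results:
        print(url, status)

# loop.run_until_complete(main_shared())  # or asyncio.run(main_shared()) on Python 3.7+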


Example

A comic crawler; an async crawler whose speed seems limited by its own design: chapters, and the HTML pages within each chapter, are awaited one after another, so only the images on a single HTML page are downloaded concurrently (a sketch of a more concurrent chapter loop is at the end of this post).

github: Mess-Mess/spider_didiaomh.com_async.py at master · Wldcmzy/Mess-Mess (github.com)

from bs4 import BeautifulSoup
import re
from Wrappers import logger, remove_invalid_element_in_windows_path, try_except_ensure
from Wrappers.Affix import Affix_OnlyZeroExample
from Wrappers.HTMLwrapper import Image_HTMLwrapper_Lv2
import os
import aiohttp
import asyncio

class SpiderX:
    DOMIN: str = 'https://www.didiaomh.com/'

    def __init__(
        self,
        catalog_url: str,
        outpath_name: str = 'out',
        capture_format: str = 'capture{arg}',
        image_format: str = 'image{arg}.jpg',
        max_capture_number_length: int = 3,
        max_page_number_length: int = 3,
        work_span: tuple[int] = (0, 0),
        concurrency: int = 5,
        myproxy = None,
        # headers: dict = {},
    ) -> None:
        '''
        catalog_url:
            URL of the catalog (table of contents) page
        outpath_name:
            name of the output directory
        capture_format:
            filename format for a chapter
        image_format:
            filename format for an image
        max_capture_number_length:
            maximum width of the zero-padded chapter number
        max_page_number_length:
            maximum width of the zero-padded image number
        work_span: tuple[int]:
            range of chapters to crawl as (low, high);
            the default (0, 0) crawls everything
        concurrency:
            number of tasks allowed to run at the same time
        myproxy:
            proxy server information
        '''
        self.catalog_url = catalog_url if catalog_url[ : len(self.DOMIN)] == self.DOMIN else self.DOMIN + catalog_url
        self.outpath_name = outpath_name
        self.check_path(self.outpath_name)
        self.capture_format = capture_format
        self.image_format = image_format
        self.max_capture_number_length = max_capture_number_length
        self.max_page_number_length = max_page_number_length
        self.work_span = work_span
        self.myproxy = myproxy

        self.page_counter = 0
        self.affix = Affix_OnlyZeroExample()

        self.semaphore = asyncio.Semaphore(concurrency)
        # a single shared session is created here and reused for every request
        self.session = aiohttp.ClientSession()

    def toreload_format_page_name(self, index: int):
        return self.image_format.format(arg = self.affix.add_prefix(str(index), self.max_page_number_length))

    def toreload_format_capture_name(self, index: int, capture_title: str):
        return self.capture_format.format(arg = self.affix.add_prefix(str(index), self.max_page_number_length) + capture_title)

    def check_path(self, path: str) -> None:
        '''Check that a directory exists; create it if it does not.'''
        if not os.path.exists(path):
            os.mkdir(path)
            logger.debug(f'Creating directory: {path}...')

    async def get_html(self, url: str) -> str:
        '''Fetch the HTML of a page.'''
        async with self.semaphore:
            async with self.session.get(url, proxy = self.myproxy) as response:
                return await response.text()

    async def get_catalog(self) -> dict[int, tuple[str, str]]:
        '''
        Build the comic's catalog (chapter -> URL mapping).
        return:
            { index : (url, capture_title), }
        '''
        html = await self.get_html(self.catalog_url)
        soup = BeautifulSoup(html, 'html.parser')  # name the parser explicitly to avoid bs4's warning
        li_tag_list: list[BeautifulSoup] = soup.find('ul', class_ = 'chapter-list clearfix').findAll('li')
        catalog = {}
        for i, each in enumerate(li_tag_list):
            catalog[i + 1] = self.DOMIN + each.a['href'], each.a.string
        return catalog

    async def save_image(self, pathname: str, data: bytes) -> None:
        '''Save a single image to disk.'''
        with open(pathname, 'wb') as f:
            f.write(data)

    async def download_one_image(self, url: str, pathname: str) -> None:
        '''Download a single image.'''
        async with self.session.get(url) as res:
            await self.save_image(pathname, await res.read())

    async def download_one_capture_perpage(self, soup: BeautifulSoup, foldername: str) -> None:
        '''Download all images on one HTML page of a chapter.'''
        li_tag_list: list[BeautifulSoup] = soup.find('div', class_ = 'comiclist').findAll('div', class_ = 'comicpage')
        tasks = []
        for each in li_tag_list:
            url: str = each.img['src']
            self.page_counter += 1
            filename = remove_invalid_element_in_windows_path(self.toreload_format_page_name(self.page_counter))
            tasks.append(asyncio.ensure_future(self.download_one_image(url, f'{self.outpath_name}/{foldername}/{filename}')))
        # the images on one HTML page are downloaded concurrently
        await asyncio.wait(tasks)


    async def download_one_capture(self, url: str, foldername: str) -> None:
        '''Download all images of one chapter.'''
        self.check_path(f'{self.outpath_name}/{foldername}')
        self.page_counter = 0
        html = await self.get_html(url)
        soup = BeautifulSoup(html, 'html.parser')
        # the "第x/y页" selector text tells us how many HTML pages the chapter has
        total_pages = int(re.search('第[0-9]+/([0-9]+)页', soup.find('select', class_ = 'selectpage').option.string).group(1))
        logger.log('html page 1')
        await self.download_one_capture_perpage(soup, foldername)
        for i in range(2, total_pages + 1):
            logger.log(f'html page {i}')
            html = await self.get_html(url.replace('.html', f'?page={i}'))
            soup = BeautifulSoup(html, 'html.parser')
            await self.download_one_capture_perpage(soup, foldername)

    @try_except_ensure
    async def download_all_caputres(self) -> None:
        '''Download every image of the comic.'''
        catalog = await self.get_catalog()
        work_span_low, work_span_high = self.work_span
        flag = work_span_high > work_span_low
        logger.debug(f'work span flag:{flag}, low:{work_span_low}, high:{work_span_high}')
        for key, value in catalog.items():
            if flag:
                if key > work_span_high or key < work_span_low:
                    continue
            logger.log(f'Chapter progress: {key}/{len(catalog)}')
            url, capture_title = value
            capture_name = remove_invalid_element_in_windows_path(self.toreload_format_capture_name(key, capture_title))
            await self.download_one_capture(url, capture_name)


if __name__ == '__main__':
    loop = asyncio.get_event_loop()

    x = SpiderX(
        'https://www.didiaomh.com/manhua/6500.html',
        outpath_name = '偷星九月天IMAGES',
    )

    try:
        loop.run_until_complete(x.download_all_caputres())
    except Exception as e:
        logger.error(f'{type(e)}|{str(e)}')
    finally:
        loop.run_until_complete(x.session.close())  # close the shared ClientSession before closing the loop
        loop.close()

#===========================================================================

class TX9MOONSKY(Image_HTMLwrapper_Lv2):
    def __init__(
        self,
        root: str,
        html_path: str,
        html_vital_element: str,
        html_collection_name,
        prefix_LvRoot: str,
        prefix_Lv2: str,
    ) -> None:
        super().__init__(
            root,
            html_path,
            html_vital_element,
            html_collection_name,
            prefix_LvRoot,
            prefix_Lv2
        )

    def toreload_parse_diff_element_title(self, folder_name: str) -> str:
        return super().toreload_parse_diff_element_title(folder_name)

    def toreload_parse_diff_element_P(self, folder_name: str) -> str:
        return super().toreload_parse_diff_element_P(folder_name)[3 : ]


y = TX9MOONSKY(
    root = '偷星九月天IMAGES',
    html_path = '偷星九月天HTMLS',
    html_vital_element = '偷星九月天 {arg}',
    html_collection_name = ' 偷星九月天 全集 ',
    prefix_LvRoot = 'capture',
    prefix_Lv2 = 'image'
)
y.create_htmls()
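
As noted above, download_all_caputres awaits each chapter (and each HTML page within a chapter) in sequence, so only the images on one HTML page ever download in parallel. A hedged sketch of how the chapter loop could be gathered instead; this is not a drop-in change, because self.page_counter is shared mutable state in the original class and would first have to become per-chapter:

# illustrative only, not part of the original repo
async def download_all_captures_concurrent(self) -> None:
    catalog = await self.get_catalog()
    tasks = []
    for key, (url, capture_title) in catalog.items():
        capture_name = remove_invalid_element_in_windows_path(
            self.toreload_format_capture_name(key, capture_title))
        tasks.append(self.download_one_capture(url, capture_name))
    # page fetches are still bounded by self.semaphore inside get_html
    await asyncio.gather(*tasks)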