• 异步爬小说


    异步最适合的场景是爬取多页面或者多层级的内容,所以这里直接拿小说来练手

    直接百度小说

    随便点一本小说,然后打开调试

    找到加载章节的数据包

    然后保存这个包的url地址

    点一个章节进去,然后又看一下包

     

     看到了内容,url也复制一下

    1. # https://dushu.baidu.com/api/pc/getCatalog?data={%22book_id%22:%224356290733%22}
    2. # https://dushu.baidu.com/api/pc/getChapterContent?data={%22book_id%22:%224356290733%22,%22cid%22:%224356290733|1569830905%22,%22need_bookinfo%22:1}

    发现url里有%22,这是双引号(")经过URL编码后的结果,直接还原成双引号即可

    然后对第一个url进行请求

    def getCatalog(url):
        """Fetch the catalog URL synchronously and print the raw response body.

        Args:
            url: Full catalog API URL, including the ``data`` query parameter.
        """
        # A timeout keeps the script from hanging forever on a stalled connection.
        resp = requests.get(url, timeout=10)
        # Print the raw payload so the chapter list can be located by eye.
        print(resp.text)


    if __name__ == '__main__':
        bok_id = "4356290733"
        url = 'https://xxxxx.com/api/pc/getCatalog?data={"book_id":"' + bok_id + '"}'
        getCatalog(url)

     

    因为返回的是JSON数据,所以直接用resp.json()解析,然后定位到data→novel→items,从每一项里取出title和cid

     

    因为每个cid对应一个章节,所以可以给每个章节建一个异步任务,开始上异步

    修改代码下

    async def aiodownload(cid, bok_id, title):
        """Placeholder for the per-chapter download coroutine.

        Args:
            cid: Chapter id taken from the catalog.
            bok_id: Id of the book the chapter belongs to.
            title: Chapter title.
        """
        # Hard-coded sample of the query payload seen in the chapter-content URL;
        # the request itself is not implemented at this step.
        data = {"book_id":"4356290733","cid":"4356290733|1569830905","need_bookinfo":1}


    async def getCatalog(url):
        """Fetch the catalog and run one ``aiodownload`` coroutine per chapter.

        Reads the module-level ``bok_id`` assigned in the ``__main__`` block.

        Args:
            url: Full catalog API URL, including the ``data`` query parameter.

        Raises:
            requests.HTTPError: If the catalog request returns an error status.
        """
        # A single blocking request before any task is started; the timeout
        # prevents an indefinite hang.
        resp = requests.get(url, timeout=10)
        resp.raise_for_status()
        dic = resp.json()
        # Prepare one coroutine per catalog entry.
        tasks = [
            aiodownload(item['cid'], bok_id, item['title'])
            for item in dic['data']['novel']['items']
        ]
        # asyncio.wait() rejects bare coroutine objects on Python 3.11+ and
        # raises ValueError on an empty collection. gather() accepts both, and
        # also propagates the first exception raised by a download.
        await asyncio.gather(*tasks)


    if __name__ == '__main__':
        bok_id = "4356290733"
        url = 'https://dushu.baidu.com/api/pc/getCatalog?data={"book_id":"' + bok_id + '"}'
        asyncio.run(getCatalog(url))

     仔细看一下data那一行的数据,可以改成

    data = {"book_id":bok_id,"cid":f"{bok_id}|{cid}","need_bookinfo":1}

     第一个url是用来获取所有章节目录的,第二个url是用来获取每一章内容的;把第二个url的请求补进aiodownload之后,就是如下所示了

    async def aiodownload(cid, bok_id, title):
        """Request one chapter's content from the chapter API.

        Args:
            cid: Chapter id taken from the catalog.
            bok_id: Id of the book the chapter belongs to.
            title: Chapter title (not used yet at this step).
        """
        # The chapter URL carries cid as "<book_id>|<chapter_id>". Writing
        # `bok_id|cid` would apply the | operator to two strings and raise
        # TypeError, so the value is built with an f-string instead.
        data = {"book_id": bok_id, "cid": f"{bok_id}|{cid}", "need_bookinfo": 1}
        data = json.dumps(data)  # serialize to a JSON string for the query
        url = f'https://xxxx.com/api/pc/getChapterContent?data={data}'
        async with aiohttp.ClientSession() as session:
            async with session.get(url) as resp:
                # Parsed chapter payload; nothing is done with it at this step.
                dic = await resp.json()


    async def getCatalog(url):
        """Fetch the catalog and run one ``aiodownload`` coroutine per chapter.

        Reads the module-level ``bok_id`` assigned in the ``__main__`` block.

        Args:
            url: Full catalog API URL, including the ``data`` query parameter.

        Raises:
            requests.HTTPError: If the catalog request returns an error status.
        """
        # A single blocking request before any task is started; the timeout
        # prevents an indefinite hang.
        resp = requests.get(url, timeout=10)
        resp.raise_for_status()
        dic = resp.json()
        # Prepare one coroutine per catalog entry.
        tasks = [
            aiodownload(item['cid'], bok_id, item['title'])
            for item in dic['data']['novel']['items']
        ]
        # asyncio.wait() rejects bare coroutine objects on Python 3.11+ and
        # raises ValueError on an empty collection. gather() accepts both, and
        # also propagates the first exception raised by a download.
        await asyncio.gather(*tasks)


    if __name__ == '__main__':
        bok_id = "4356290733"
        url = 'https://xxx.xxx.com/api/pc/getCatalog?data={"book_id":"' + bok_id + '"}'
        asyncio.run(getCatalog(url))

    保存内容就简单了,拿到响应之后把正文异步写入以章节标题命名的文件

    1. dic = await resp.json()
    2. async with aiofiles.open(title,mode="w",encoding="utf-8") as f:
    3. await f.write(dic['data']['novel']['content']) # write the chapter text into the file

    全部代码如下

    import asyncio
    import json

    import aiofiles
    import aiohttp
    import requests


    async def aiodownload(cid, bok_id, title):
        """Download one chapter and save its text to a file named after the title.

        Args:
            cid: Chapter id taken from the catalog.
            bok_id: Id of the book the chapter belongs to.
            title: Chapter title; used as the output file path.
        """
        # The chapter URL carries cid as "<book_id>|<chapter_id>".
        data = {"book_id": bok_id, "cid": f"{bok_id}|{cid}", "need_bookinfo": 1}
        data = json.dumps(data)  # serialize to a JSON string for the query
        url = f'https://xxxx.com/api/pc/getChapterContent?data={data}'
        async with aiohttp.ClientSession() as session:
            async with session.get(url) as resp:
                dic = await resp.json()
                async with aiofiles.open(title, mode="w", encoding="utf-8") as f:
                    # Write the chapter text into the file.
                    await f.write(dic['data']['novel']['content'])


    async def getCatalog(url):
        """Fetch the catalog and download every chapter concurrently.

        Reads the module-level ``bok_id`` assigned in the ``__main__`` block.

        Args:
            url: Full catalog API URL, including the ``data`` query parameter.

        Raises:
            requests.HTTPError: If the catalog request returns an error status.
        """
        # A single blocking request before any task is started; the timeout
        # prevents an indefinite hang.
        resp = requests.get(url, timeout=10)
        resp.raise_for_status()
        dic = resp.json()
        # Prepare one coroutine per catalog entry.
        tasks = [
            aiodownload(item['cid'], bok_id, item['title'])
            for item in dic['data']['novel']['items']
        ]
        # asyncio.wait() rejects bare coroutine objects on Python 3.11+ and
        # raises ValueError on an empty collection. gather() accepts both, and
        # also propagates the first exception raised by a download.
        await asyncio.gather(*tasks)


    if __name__ == '__main__':
        bok_id = "4356290733"
        url = 'https://xxx.com/api/pc/getCatalog?data={"book_id":"' + bok_id + '"}'
        asyncio.run(getCatalog(url))

  • 相关阅读:
    STM32 CAN过滤器标识符学习笔记
    SQL触发器
    detect.py和train.py的参数解释
    Linux高性能服务器I/0高级应用:非阻塞connect(15)
    ESP32C3 LuatOS TM1650①驱动测试
    新加坡打车软件平台Ryde Group申请1700万美元纳斯达克IPO上市
    少儿编程 电子学会图形化编程等级考试Scratch三级真题解析(判断题)2022年6月
    Qt软键盘使用和修改软键盘参数 支持中文
    数据结构绪论思维导图
    如何获取文本中出现的所有单词
  • 原文地址:https://blog.csdn.net/m0_66060262/article/details/127134136