• 爬取某网站计算机类图书


    网页链接:

    https://www.ptpress.com.cn/shopping/search?tag=search&orderstr=hot&level1=75424c57-6dd7-4d1f-b6b9-8e95773c0593

    一、为了完成爬取数据,需要进行以下步骤

    1.在浏览器中打开页面,选择"计算机"

    2.可以看到大量的"计算机"相关书籍,右键点击"检查"

    3.刷新页面,点击下一页,查看url

    4.点击"Response",查看json格式中的信息,发现与要爬取的书籍信息一致

    5.滑动到页面最底端,可以看到计算机类图书的总页数(570页)

    6.查看Date格式

    7.根据书籍详情来爬取相关信息

    二、代码部分

    1.将爬取内容打印并保存到Excel表格中
    1. import requests
    2. import re
    3. import datetime
    4. from time import sleep
    5. import pandas as pd
    6. S = "bookLink"
    7. headers = {
    8. 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
    9. '(KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36 Edg/95.0.1020.44',
    10. }
    11. url = 'https://www.ptpress.com.cn/bookinfo/getBookListForEBTag'
    12. book_info = []
    13. for page in range(1,571):
    14. data = {
    15. 'bookTagId':'a15a734f-0ae9-41d7-9012-6ef9de2e71c8',
    16. 'page':f'{page}',
    17. 'rows':'18',
    18. 'orderStr':'publish'
    19. }
    20. response = requests.post(url,data=data,headers=headers)
    21. data = response.json()
    22. author = data["data"]["data"][0]["author"]
    23. isbn = data["data"]["data"][0]["isbn"]
    24. publish = datetime.datetime.strptime(
    25. data["data"]["data"][0]["publishDate"],"%Y%m"
    26. )
    27. discountPrice = data["data"]["data"][0]["discountPrice"]
    28. bookDiscount = data["data"]["data"][0]["bookDiscount"]
    29. price = data["data"]["data"][0]["price"]
    30. bookId = data["data"]["data"][0]["bookId"]
    31. executiveEditor = data["data"]["data"][0]["executiveEditor"]
    32. bookName = data["data"]["data"][0]["bookName"]
    33. picPath = data["data"]["data"][0]["picPath"]
    34. bookLink = "https://www.ptpress.com.cn/shopping/buy?bookId=" + bookId
    35. book_info.append({
    36. "author":author,
    37. "isbn":isbn,
    38. "publish":publish,
    39. "discountPrice":discountPrice,
    40. "bookDiscount":bookDiscount,
    41. "price":price,
    42. "bookId":bookId,
    43. "executiveEditor":executiveEditor,
    44. "bookName":bookName,
    45. "picPath":picPath,
    46. "bookLink":bookLink
    47. })
    48. print(f"第{page}页爬取成功!")
    49. sleep(1)
    50. print(book_info)
    51. # 将数据保存到Excel文件中
    52. df = pd.DataFrame(book_info)
    53. df.to_excel("book_info.xlsx", index=False)
    爬取结果:

    2.将爬取内容打印并保存到csv文件中
    1. import requests
    2. import re
    3. import datetime
    4. from time import sleep
    5. import pandas as pd
    6. import csv
    7. S = "bookLink"
    8. headers = {
    9. 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
    10. '(KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36 Edg/95.0.1020.44',
    11. }
    12. url = 'https://www.ptpress.com.cn/bookinfo/getBookListForEBTag'
    13. book_info = []
    14. for page in range(1,571):
    15. data = {
    16. 'bookTagId':'a15a734f-0ae9-41d7-9012-6ef9de2e71c8',
    17. 'page':f'{page}',
    18. 'rows':'18',
    19. 'orderStr':'publish'
    20. }
    21. response = requests.post(url,data=data,headers=headers)
    22. data = response.json()
    23. author = data["data"]["data"][0]["author"]
    24. isbn = data["data"]["data"][0]["isbn"]
    25. publish = datetime.datetime.strptime(
    26. data["data"]["data"][0]["publishDate"],"%Y%m"
    27. )
    28. discountPrice = data["data"]["data"][0]["discountPrice"]
    29. bookDiscount = data["data"]["data"][0]["bookDiscount"]
    30. price = data["data"]["data"][0]["price"]
    31. bookId = data["data"]["data"][0]["bookId"]
    32. executiveEditor = data["data"]["data"][0]["executiveEditor"]
    33. bookName = data["data"]["data"][0]["bookName"]
    34. picPath = data["data"]["data"][0]["picPath"]
    35. bookLink = "https://www.ptpress.com.cn/shopping/buy?bookId=" + bookId
    36. book_info.append({
    37. "author":author,
    38. "isbn":isbn,
    39. "publish":publish,
    40. "discountPrice":discountPrice,
    41. "bookDiscount":bookDiscount,
    42. "price":price,
    43. "bookId":bookId,
    44. "executiveEditor":executiveEditor,
    45. "bookName":bookName,
    46. "picPath":picPath,
    47. "bookLink":bookLink
    48. })
    49. print(f"第{page}页爬取成功!")
    50. sleep(1)
    51. print(book_info)
    52. # 将数据保存到csv文件中
    53. df = pd.DataFrame(book_info)
    54. df.to_csv("人民邮电计算机书本信息.csv", index=False)
    爬取结果:

    3.将爬取内容打印并保存到MySQL数据库中
    1. import requests
    2. import re
    3. import datetime
    4. from time import sleep
    5. import pandas as pd
    6. import csv
    7. import pymysql
    8. S = "bookLink"
    9. headers = {
    10. 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
    11. '(KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36 Edg/95.0.1020.44',
    12. }
    13. url = 'https://www.ptpress.com.cn/bookinfo/getBookListForEBTag'
    14. book_info = []
    15. for page in range(1,571):
    16. data = {
    17. 'bookTagId':'a15a734f-0ae9-41d7-9012-6ef9de2e71c8',
    18. 'page':f'{page}',
    19. 'rows':'18',
    20. 'orderStr':'publish'
    21. }
    22. response = requests.post(url,data=data,headers=headers)
    23. data = response.json()
    24. author = data["data"]["data"][0]["author"]
    25. isbn = data["data"]["data"][0]["isbn"]
    26. publish = datetime.datetime.strptime(
    27. data["data"]["data"][0]["publishDate"],"%Y%m"
    28. )
    29. discountPrice = data["data"]["data"][0]["discountPrice"]
    30. bookDiscount = data["data"]["data"][0]["bookDiscount"]
    31. price = data["data"]["data"][0]["price"]
    32. bookId = data["data"]["data"][0]["bookId"]
    33. executiveEditor = data["data"]["data"][0]["executiveEditor"]
    34. bookName = data["data"]["data"][0]["bookName"]
    35. picPath = data["data"]["data"][0]["picPath"]
    36. bookLink = "https://www.ptpress.com.cn/shopping/buy?bookId=" + bookId
    37. book_info.append({
    38. "author":author,
    39. "isbn":isbn,
    40. "publish":publish,
    41. "discountPrice":discountPrice,
    42. "bookDiscount":bookDiscount,
    43. "price":price,
    44. "bookId":bookId,
    45. "executiveEditor":executiveEditor,
    46. "bookName":bookName,
    47. "picPath":picPath,
    48. "bookLink":bookLink
    49. })
    50. print(f"第{page}页爬取成功!")
    51. sleep(1)
    52. print(book_info)
    53. # 将数据保存到MySQL数据库中
    54. conn = pymysql.connect(host='localhost', user='root', password='your_password', db='your_database', charset='utf8')
    55. cursor = conn.cursor()
    56. # 创建表格booklist
    57. cursor.execute('CREATE TABLE IF NOT EXISTS booklist (author VARCHAR(255), isbn VARCHAR(255), publish DATE, discountPrice FLOAT, bookDiscount FLOAT, price FLOAT, bookId VARCHAR(255), executiveEditor VARCHAR(255), bookName VARCHAR(255), picPath VARCHAR(255), bookLink VARCHAR(255))')
    58. # 将数据插入到表格booklist中
    59. for book in book_info:
    60. sql = f"INSERT INTO booklist (author, isbn, publish, discountPrice, bookDiscount, price, bookId, executiveEditor, bookName, picPath, bookLink) VALUES ('{book['author']}', '{book['isbn']}', '{book['publish'].strftime('%Y-%m-%d')}', {book['discountPrice']}, {book['bookDiscount']}, {book['price']}, '{book['bookId']}', '{book['executiveEditor']}', '{book['bookName']}', '{book['picPath']}', '{book['bookLink']}')"
    61. cursor.execute(sql)
    62. # 提交事务
    63. conn.commit()
    64. # 关闭连接
    65. cursor.close()
    66. conn.close()
    爬取结果:

  • 相关阅读:
    title标签和meta标签怎样设置?有什么含义?
    leetcode刷题日记:141. Linked List Cycle(环形链表)
    渗透测试突破口——未授权访问
    Java之HikariCP数据库连接池浅入浅出
    PAC工单科目金额明细
    一、认识STM32
    Windows简单安装redis
    图文详解线性回归与局部加权线性回归+房价预测实例
    冲刺第十五届蓝桥杯P0003倍数问题
    在配置文件“tsconfig.json”中找不到任何输入。指定的 “include“ 路径为“[“**/*“]”,“exclude“ 路径为[]
  • 原文地址:https://blog.csdn.net/m0_74972727/article/details/133892237