20230908_python练习_selenium模块爬取网页小说练习

爬取小说《霍比特人》：使用 selenium 模块以无界面（headless）模式调用谷歌浏览器，抓取小说网站上的章节内容，并将正文按每段 2000 个字符拆分后保存到 MySQL 数据库中。
在这里插入图片描述

# https://www.shukuai9.com/b/324694/
# Third-party libraries used by the scraper
from selenium import webdriver
# Locator strategies (CSS selector, XPath, ...) for find_element / find_elements
from selenium.webdriver.common.by import By
# Keys module, used to simulate keyboard input
from selenium.webdriver.common.keys import Keys
# MySQL client
import pymysql

# Clean the chapter text and store it in MySQL in chunks of `chunk_size` characters.
def text_sql(title, dimension, text, chunk_size=2000):
    """Strip control whitespace from *text* and persist it chunk by chunk.

    Args:
        title: Book title, passed through to ``mysql_in``.
        dimension: Chapter title, passed through to ``mysql_in``.
        text: Raw chapter text.
        chunk_size: Maximum number of characters per stored chunk
            (defaults to 2000, the size the script has always used).

    Raises:
        ValueError: If ``chunk_size`` is not positive.

    A failure while storing one chunk is printed and reported as '异常';
    the remaining chunks are still attempted (best-effort behaviour).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    # Drop tab (9), line feed (10) and carriage return (13) in one pass.
    text = text.translate({9: None, 10: None, 13: None})

    # Ceiling division: number of chunks that will actually be written.
    chunk_count = -(-len(text) // chunk_size)
    print(title, dimension, len(text), chunk_count)

    # Stepping by chunk_size never yields an empty trailing chunk, unlike
    # iterating over (len // chunk_size) + 1 slices, which stored an empty
    # row whenever the length was an exact multiple of the chunk size.
    for i, begin in enumerate(range(0, len(text), chunk_size)):
        try:
            status = mysql_in(title, dimension, i, text[begin:begin + chunk_size])
        except Exception as e:
            # Deliberately broad: one failed chunk must not abort the chapter.
            print(e)
            status = '异常'
        print(title, dimension, i, status)

# Database helpers: every call opens one short-lived MySQL connection.
def mysql_execute(in_sql, leixing, params=None):
    """Run one SQL statement and return a result shaped by *leixing*.

    Args:
        in_sql: SQL text. Use ``%s`` placeholders together with *params*
            instead of formatting values into the string.
        leixing: Result mode:
            '数量' - return the value of ``cursor.execute`` (row count);
            '单条' - return the first column of the first row, or ``None``
                     when the query produced no row;
            '多条' - return all rows from ``cursor.fetchall()``;
            '编辑' - execute, commit, and return the affected row count.
        params: Optional sequence/dict of values bound to the placeholders.
            ``None`` (the default) executes *in_sql* verbatim.

    Raises:
        ValueError: If *leixing* is not one of the four supported modes.
    """
    if leixing not in ('数量', '单条', '多条', '编辑'):
        # Fail before touching the database instead of hitting an unbound
        # result variable after the statement has already run.
        raise ValueError("unsupported leixing: %r" % (leixing,))

    # NOTE(review): credentials are hard-coded here; consider loading them
    # from configuration or environment variables.
    conn = pymysql.connect(host='127.0.0.1', port=3306, user='szc_sql', password='szcNSP850219', database='szc_sql',charset='utf8')
    try:
        cursor = conn.cursor()
        try:
            affected = cursor.execute(in_sql, params)
            if leixing == '单条':
                row = cursor.fetchone()
                return row[0] if row is not None else None
            if leixing == '多条':
                return cursor.fetchall()
            if leixing == '编辑':
                conn.commit()
            return affected
        finally:
            # Always release the cursor, even when execute/fetch raises.
            cursor.close()
    finally:
        # Always release the connection, even when execute/fetch raises.
        conn.close()


def mysql_in(title, dimension, num, text):
    """Replace the stored chunk identified by (title, dimension, num).

    Any existing row with the same key is deleted first, then the new
    chunk is inserted. Values are bound as query parameters so quotes or
    other special characters in the scraped text cannot break or alter
    the SQL.

    Returns:
        The string '完成' once both statements have been executed.
    """
    # num is sent as a string, matching how it was previously quoted in SQL.
    key = (title, dimension, str(num))
    mysql_execute(
        "delete from novel_text where title=%s and dimension=%s and num=%s",
        '编辑',
        key,
    )
    mysql_execute(
        "insert into novel_text (title,dimension,num,text) values(%s,%s,%s,%s)",
        '编辑',
        key + (text,),
    )
    return '完成'


# Configure Chrome to run headless with the GPU disabled.
chrome_opt = webdriver.ChromeOptions()
chrome_opt.add_argument('--headless')
chrome_opt.add_argument('--disable-gpu')
# NOTE(review): `executable_path` is only accepted by older Selenium
# releases; newer ones expect a Service object — confirm the installed version.
driver = webdriver.Chrome(executable_path='./chromedriver.exe',options=chrome_opt)
try:
    # The implicit wait applies to every later element lookup, so it is set
    # once up front; it does not wait for page loads.
    driver.implicitly_wait(10)

    # Open the book's index page.
    driver.get("https://www.shukuai9.com/b/324694/")

    # <dd> entries hold the chapter links; <h1> holds the book title.
    search_results = driver.find_elements(By.CSS_SELECTOR, "dd")
    search_title = driver.find_elements(By.CSS_SELECTOR, "h1")
    print('search_title',search_title)

    if not search_title:
        # Without a title there is nothing to key the stored rows on.
        raise RuntimeError("no <h1> element found on the index page")
    for text in search_title:
        print('text',text.text)
        # If several <h1> elements exist, the last one wins.
        search_title_text = text.text

    # Map chapter number (the text between the first character and '节')
    # to [chapter title, chapter URL].
    hbt_key = {}
    for result in search_results:
        anchor = result.find_element(By.CSS_SELECTOR, "a")
        title = anchor.text
        link = anchor.get_attribute("href")
        num = title.find('节')
        if num == -1:
            # Not a chapter entry in the expected format; skip it explicitly.
            print('skip', title)
            continue
        title_num = title[1:num]
        hbt_key[title_num]=[title,link]

    # Visit chapters in numeric order; keys that are not plain numbers keep
    # their page order and come last, so gaps in numbering cannot raise.
    for title_num in sorted(hbt_key, key=lambda k: (0, int(k)) if k.isdecimal() else (1, 0)):
        print(hbt_key[title_num])
        title, link = hbt_key[title_num]

        # Open the chapter page and read the text of the #content element.
        driver.get(link)
        search_text = driver.find_element(By.XPATH, '//*[@id="content"]').text
        print(len(search_text),type(search_text))

        # Persist the chapter text.
        text_sql(search_title_text,title,search_text)
finally:
    # Always shut the browser and chromedriver process down.
    driver.quit()

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125

相关阅读:
软件测试怎么学？App自动化、Web自动化、性能测试怎么学？一文总结
 【LeetCode刷题】2两数相加
 特约｜数码转型思考：Web3.0与银行
 国内“风口”转变，中国游戏公司纷纷“外逃”，东南亚是个好去处
 *（长期更新）软考网络工程师学习笔记——Section 21 防火墙技术原理
 UNet网络制作
 Sqlmap 22.05.22.02
html和css语法记录
 IDERA ER/Studio Data Architect Professional v19.3.2
【Less-CSS】初识Less，使编写 CSS 变得简洁
原文地址：https://blog.csdn.net/szc_1985/article/details/132942330