代码拿去就可以用,输入你要爬的贴吧名就行了。
看贴吧有多少页,range范围改为多少
爬取的评论在D:/res/info.txt中,注意控制访问频率,不要把time.sleep()删了,要不然ip会被被临时封了,也可能被验证码疯狂骚扰,如果你有ip代理池那随意了
- import requests
- import os
- import numpy as np
- from concurrent.futures import ThreadPoolExecutor
- import time
- import re
- header={
- 'user-agent':
- 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.76'
- }
-
- #with open('D:/res/headers.txt','r',encoding='utf-8')as f:
- # headers=f.readlines()
-
- #proxies=[]
- #with open('D:/res/https.txt','r',encoding='utf-8') as f:
- # proxies=f.readlines()
- proxy={'http':'http://106.11.226.232:8009','https':'http://106.11.226.232:8009'}
- def fun(url,page):
- time.sleep(10)
- #proxy={'https':'http://'+np.random.choice(proxies),'http':'http://'+np.random.choice(proxies)}
- r=requests.get(url,headers=header,timeout=10,proxies=proxy,verify=False,allow_redirects=False)
- if r.status_code!=200:
- print('error')
- print(r.text)
- hrefs=re.findall('href=".*?" title=".*?" target="_blank" class="j_th_tit ">',r.text)
- print(hrefs)
- print(page)
- if len(hrefs)==0:
- return
-
- for href in hrefs:
- time.sleep(5)
-
- href='https://tieba.baidu.com'+href[6:19]
-
- #proxy={'https':'http://'+np.random.choice(proxies),'http':'http://'+np.random.choice(proxies)}
- r=requests.get(href,headers=header,timeout=10,proxies=proxy,verify=False,allow_redirects=False)
-
- print(r.encoding)
- r.encoding='utf-8'
- print(page)
- texts=re.findall('class="d_post_content j_d_post_content clearfix" style="display:;"> .*?<',r.text)
- print(texts)
- prefix='class="d_post_content j_d_post_content clearfix" style="display:;"> '
- for i in range(len(texts)):
- texts[i]=texts[i][len(prefix):-1]
-
- if not os.path.exists('D:/res'):
- os.makedirs('D:/res')
-
- with open('D:/res/info.txt','a',encoding='utf-8') as f:
-
- f.write(' '.join(texts))
-
-
- #pool=ThreadPoolExecutor(2)
- #500
-
- topic=input()
- for page in range(0,1000):
- url=f'https://tieba.baidu.com/f?kw={topic}&ie=utf-8&pn={page*50}'
- #pool.submit(fun,url)
- fun(url,page)