Python从入门到入土-网络爬虫（BeautifulSoup、lxml解析网页、requests获取网页）

BeautifulSoup

获取所有p标签里的文本

# 获取所有p标签里的文本

# -*- coding: UTF-8 -*-
from bs4 import BeautifulSoup

# 在此实现代码
def fetch_p(html):
    """Collect the text of every ``<p>`` tag found in ``html``.

    Args:
        html: HTML markup to parse.

    Returns:
        A list holding the ``text`` of each ``<p>`` tag, in the order
        ``find_all`` returns them.
    """
    document = BeautifulSoup(html, 'lxml')
    return [paragraph.text for paragraph in document.find_all("p")]

if __name__ == '__main__':
    # Sample page. The markup must be present so the parser sees real <p>
    # elements; plain text without tags does not exercise the extraction.
    html = '''
        <html>
            <head>
                <title>这是一个简单的测试页面</title>
            </head>
            <body>
                <p class="item-0">body 元素的内容会显示在浏览器中。</p>
                <p class="item-1">title 元素的内容会显示在浏览器的标题栏中。</p>
            </body>
        </html>
        '''
    p_text = fetch_p(html)
    print(p_text)

BeautifulSoup 获取text

# BeautifulSoup 获取text
#
# 获取网页的text

# -*- coding: UTF-8 -*-
from bs4 import BeautifulSoup

# 在此实现代码
def fetch_text(html):
    """Return the text content of ``html`` with the markup removed.

    Args:
        html: HTML markup to parse.

    Returns:
        The ``text`` of the parsed document as one string.
    """
    return BeautifulSoup(html, 'lxml').text

if __name__ == '__main__':
    # Sample page with its markup intact, so the demo shows the tags being
    # stripped away rather than feeding already-plain text to the parser.
    html = '''
        <html>
            <head>
                <title>这是一个简单的测试页面</title>
            </head>
            <body>
                <p class="item-0">body 元素的内容会显示在浏览器中。</p>
                <p class="item-1">title 元素的内容会显示在浏览器的标题栏中。</p>
            </body>
        </html>
        '''
    text = fetch_text(html)
    print(text)

查找网页里所有图片地址

# 查找网页里所有图片地址

from bs4 import BeautifulSoup


# 在此实现代码
def fetch_imgs(html):
    """Return the ``src`` address of every image in ``html``.

    ``<img>`` tags without a ``src`` attribute (for example lazy-loaded
    images that only set ``data-src``) are skipped instead of raising
    ``KeyError``.

    Args:
        html: HTML markup to parse.

    Returns:
        A list of ``src`` attribute values, one per matching ``<img>`` tag.
    """
    soup = BeautifulSoup(html, 'html.parser')
    # ``src=True`` restricts the match to tags that actually carry the
    # attribute, so the subscript below cannot fail.
    return [tag['src'] for tag in soup.find_all('img', src=True)]

def test():
    """Run ``fetch_imgs`` on a small sample snippet and print the result."""
    # The sample has to be one valid single-line literal that contains <img>
    # tags; a quote pair split across two lines is a SyntaxError.
    imgs = fetch_imgs(
        '<p><img src="http://example.com"/><img src="http://example.com"/></p>')
    print(imgs)


if __name__ == '__main__':
    test()

lxml解析网页

使用xpath获取所有段落的文本

# 使用xpath获取所有段落的文本

# -*- coding: UTF-8 -*-
from lxml import etree

# 在此实现代码
def fetch_text(html):
    """Run the ``//p/text()`` XPath query against ``html``.

    Args:
        html: HTML markup to parse.

    Returns:
        Whatever the query yields: the text nodes directly under ``<p>``
        elements.
    """
    return etree.HTML(html).xpath("//p/text()")

if __name__ == '__main__':
    # Sample page; the <p> elements must exist for ``//p/text()`` to have
    # something to select.
    html = '''
        <html>
            <head>
                <title>这是一个简单的测试页面</title>
            </head>
            <body>
                <p class="item-0">body 元素的内容会显示在浏览器中。</p>
                <p class="item-1">title 元素的内容会显示在浏览器的标题栏中。</p>
            </body>
        </html>
        '''
    # The result is paragraph text, not image addresses, so name it as such.
    texts = fetch_text(html)
    print(texts)

使用xpath获取所有的文本

# 使用xpath获取所有的文本

# -*- coding: UTF-8 -*-
from lxml import etree

# 在此实现代码
def fetch_text(html):
    """Run the ``//text()`` XPath query against ``html``.

    Args:
        html: HTML markup to parse.

    Returns:
        Whatever the query yields: every text node in the parsed tree.
    """
    tree = etree.HTML(html)
    return tree.xpath("//text()")


if __name__ == '__main__':
    # Sample page with its markup intact, so ``//text()`` walks a real
    # element tree (title and paragraphs).
    html = '''
        <html>
            <head>
                <title>这是一个简单的测试页面</title>
            </head>
            <body>
                <p class="item-0">body 元素的内容会显示在浏览器中。</p>
                <p class="item-1">title 元素的内容会显示在浏览器的标题栏中。</p>
            </body>
        </html>
        '''
    # The result is a list of text nodes, not image addresses.
    texts = fetch_text(html)
    print(texts)

使用xpath获取 class 为 “item-1” 的段落文本

# 使用xpath获取 class 为 "item-1" 的段落文本

# -*- coding: UTF-8 -*-
from lxml import etree

# 在此实现代码
def fetch_text(html):
    """Run the ``//p[@class='item-1']/text()`` XPath query against ``html``.

    Args:
        html: HTML markup to parse.

    Returns:
        Whatever the query yields: the text nodes directly under ``<p>``
        elements whose ``class`` attribute equals ``item-1``.
    """
    parsed = etree.HTML(html)
    matches = parsed.xpath("//p[@class='item-1']/text()")
    return matches


if __name__ == '__main__':
    # Sample page; the ``class`` attributes are required, otherwise the
    # ``@class='item-1'`` predicate can never match anything.
    html = '''
        <html>
            <head>
                <title>这是一个简单的测试页面</title>
            </head>
            <body>
                <p class="item-0">body 元素的内容会显示在浏览器中。</p>
                <p class="item-1">title 元素的内容会显示在浏览器的标题栏中。</p>
            </body>
        </html>
        '''
    # The result is paragraph text, not image addresses, so name it as such.
    texts = fetch_text(html)
    print(texts)

requests 获取网页

获取url对应的网页HTML

# 获取url对应的网页HTML

# -*- coding: UTF-8 -*-
import requests

# 在此实现代码
def get_html(url):
    """Issue a GET request for ``url`` and return the response body as text.

    Args:
        url: Address of the page to fetch.

    Returns:
        The ``text`` attribute of the response.
    """
    return requests.get(url=url).text

if __name__ == '__main__':
    # Demo: fetch one page and print its HTML.
    page = get_html("http://www.baidu.com")
    print(page)

requests 获取网页 with headers

# 使用自定义 headers 获取 url 对应的网页HTML

# -*- coding: UTF-8 -*-
import requests

def get_html(url, headers=None):
    """Issue a GET request for ``url`` and return the response body as text.

    Args:
        url: Address of the page to fetch.
        headers: Optional mapping of HTTP request headers (for example a
            custom ``User-Agent``). ``None`` sends no extra headers.

    Returns:
        The ``text`` attribute of the response.
    """
    # Forward the caller's headers; the parameter used to be accepted but
    # silently dropped, so the custom User-Agent never reached the server.
    response = requests.get(url=url, headers=headers)
    return response.text

if __name__ == '__main__':
    # Send a desktop-browser User-Agent along with the request.
    user_agent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36"
    request_headers = {"User-Agent": user_agent}
    page = get_html("http://www.baidu.com", request_headers)
    print(page)

requests post 请求

# requests post 请求

# -*- coding: UTF-8 -*-
import requests

# 在此实现代码
def get_response(url, data, headers=None):
    """Send a POST request to ``url`` and return the response body as text.

    Args:
        url: Address to post to.
        data: Form payload passed to ``requests.post`` as ``data``.
        headers: Optional mapping of HTTP request headers. ``None`` sends no
            extra headers.

    Returns:
        The ``text`` attribute of the response.
    """
    # Pass by keyword: the third positional parameter of ``requests.post`` is
    # ``json``, so a positional ``headers`` would never be sent as headers.
    response = requests.post(url, data=data, headers=headers)
    return response.text

if __name__ == '__main__':
    # Demo: post a small form to httpbin with a browser-like User-Agent and
    # print the response body.
    user_agent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36"
    form = {"key1": "value1", "key2": "value2"}
    body = get_response(
        "http://httpbin.org/post",
        form,
        {"User-Agent": user_agent},
    )
    print(body)

本文内容到此结束了，
如有收获欢迎点赞👍收藏💖关注✔️，您的鼓励是我最大的动力。
如有错误❌疑问💬欢迎各位指出。
主页：共饮一杯无的博客汇总👨‍💻

保持热爱，奔赴下一场山海。🏃🏃🏃

相关阅读:
shell是什么？ssh 与 git bash linux或cmd与 shell区别
SpringMVC面试题
关于MongoDb查询Decimal128转BigDecimal问题
数学建模笔记
鸿蒙笔记--Socket
【新版】系统架构设计师 - 软件架构设计＜轻量级架构＞
金融行业网络安全保护与三级等保合规实施方案
架构-三层架构：三层架构
让你全方位了解Shell终端，轻松学习
zk的watch机制使用及原理分析

原文地址：https://blog.csdn.net/qq_35427589/article/details/127047373