今天介绍如何使用 Playwright 将HTML网页抓取下来并保存为PDF,也可以将自己编写或生成的HTML文件转成PDF。废话不多说,直接进入教程。
# Route 1 (conda): register the conda-forge and microsoft channels, install
# the playwright package, then run Playwright's own installer.
# NOTE(review): presumably an alternative to the pip route below -- only one
# of the two should be needed; confirm against the original article layout.
conda config --add channels conda-forge
conda config --add channels microsoft
conda install playwright
playwright install
# Route 2 (pip): install playwright
pip install playwright
# Install the browser drivers
playwright install
# Usage template (placeholders, not a literal command):
#   playwright pdf <path to HTML file> <PDF output path>
playwright pdf html文件路径 PDF输出路径
# Convert ./baidu.html to ./baidu.pdf with default options.
playwright pdf ./baidu.html ./baidu.pdf
# Same conversion, passing a viewport size of 800,600.
playwright pdf --viewport-size=800,600 ./baidu.html ./baidu.pdf
# Same conversion, passing a timezone, a geolocation ("latitude,longitude")
# and a language for the page.
playwright pdf --timezone="Asia/Shanghai" --geolocation="30.890221,120.492348" --lang="zh-CN" ./baidu.html ./baidu.pdf
from playwright.sync_api import sync_playwright
# Load the list of URLs to save from urls.txt, one URL per line.
# Lines are stripped because iterating a text file keeps the trailing
# newline, and blank lines are dropped so they are never handed to
# page.goto().
with open('urls.txt', mode='rt', encoding='utf-8') as f:
    urls = [u for u in (line.strip() for line in f) if u]

# Visit each target URL and save it as "<index>.pdf" in the current
# working directory, where <index> is the URL's position in the list.
with sync_playwright() as p:
    # launch() is called without arguments; page.pdf() relies on Chromium
    # running headless, which is Playwright's default launch mode.
    browser = p.chromium.launch()
    try:
        for i, url in enumerate(urls):
            # One browser context per URL, closed as soon as the PDF is
            # written so contexts do not pile up over a long URL list.
            context = browser.new_context()
            try:
                page = context.new_page()
                page.goto(url)
                page.pdf(path=f"{i}.pdf")
            finally:
                context.close()
    finally:
        # Shut the browser down even if one of the pages failed.
        browser.close()