• JAVA爬虫系列


    目录

    准备工作

    yml

    1.入门程序(获取到静态页面)

    2.HttpClient---Get

    2.1 修改成连接池

    3.HttpClient---Get带参数

    3.1 修改成连接池

    4.HttpClient---Post

    4.1 修改成连接池

    5.HttpClient---Post带参数

    6.HttpClient-连接池

    7.设置请求信息

    8.jsoup介绍.

    9.jsoup解析url

    10.jsoup解析字符串

    11.jsoup解析文件

    12.所有dom方式获取元素

    13.元素中获取数据


    准备工作

    导入依赖

    <dependency>
        <groupId>org.apache.httpcomponents</groupId>
        <artifactId>httpclient</artifactId>
        <version>4.5.2</version>
    </dependency>

    yml

    logging:
      level:
        root: info
        com.lrm: debug

    1.入门程序(获取到静态页面)

    1. package com.itheima.reggie.utils;
    2. import org.apache.http.HttpEntity;
    3. import org.apache.http.client.methods.CloseableHttpResponse;
    4. import org.apache.http.client.methods.HttpGet;
    5. import org.apache.http.impl.client.CloseableHttpClient;
    6. import org.apache.http.impl.client.HttpClients;
    7. import org.apache.http.util.EntityUtils;
    8. import java.io.IOException;
    9. /**
    10. * @Author lpc
    11. **/
    12. public class CrawlerFirst {
    13. public static void main(String[] args) throws Exception {
    14. //1.打开浏览器,创建Httpclient对象
    15. CloseableHttpClient httpClient = HttpClients.createDefault();
    16. //2.输入网址,发起get请求创建HttpGet对象
    17. HttpGet httpGet = new HttpGet("https://www.itcast.cn/");
    18. //3.按回车,发起请求,返回响应,使用Httpclient对象发起请求
    19. CloseableHttpResponse response = httpClient.execute(httpGet);
    20. //4.解析响应,获取数据
    21. //判斯状态码是否是200
    22. if (response.getStatusLine().getStatusCode()==200){
    23. HttpEntity httpEntity = response.getEntity();
    24. //获取前端静态页面
    25. String content = EntityUtils.toString(httpEntity,"utf8");
    26. System.out.println(content);
    27. }
    28. }
    29. }

    2.HttpClient---Get

    1. package com.itheima.reggie.utils;
    2. import org.apache.http.HttpEntity;
    3. import org.apache.http.client.methods.CloseableHttpResponse;
    4. import org.apache.http.client.methods.HttpGet;
    5. import org.apache.http.impl.client.CloseableHttpClient;
    6. import org.apache.http.impl.client.HttpClients;
    7. import org.apache.http.util.EntityUtils;
    8. import java.io.IOException;
    9. /**
    10. * @Author lpc
    11. * @Date 2024 03 12 00 23
    12. **/
    13. public class CrawlerFirst {
    14. public static void main(String[] args){
    15. //1.打开浏览器,创建Httpclient对象
    16. CloseableHttpClient httpClient = HttpClients.createDefault();
    17. //2.输入网址,发起get请求创建HttpGet对象
    18. HttpGet httpGet = new HttpGet("https://www.itcast.cn/");
    19. //3.按回车,发起请求,返回响应,使用Httpclient对象发起请求
    20. CloseableHttpResponse response = null;
    21. try {
    22. response = httpClient.execute(httpGet);
    23. //4.解析响应,获取数据
    24. //判斯状态码是否是200
    25. if (response.getStatusLine().getStatusCode()==200){
    26. HttpEntity httpEntity = response.getEntity();
    27. //获取前端静态页面
    28. String content = EntityUtils.toString(httpEntity,"utf8");
    29. System.out.println(content.length());
    30. }
    31. } catch (IOException e) {
    32. throw new RuntimeException(e);
    33. }finally {
    34. try {
    35. //关闭response
    36. response.close();
    37. } catch (IOException e) {
    38. throw new RuntimeException(e);
    39. }
    40. try {
    41. //关闭浏览器
    42. httpClient.close();
    43. } catch (IOException e) {
    44. throw new RuntimeException(e);
    45. }
    46. }
    47. }
    48. }

    2.1 修改成连接池

    1. package org.example;
    2. import org.apache.http.client.methods.CloseableHttpResponse;
    3. import org.apache.http.client.methods.HttpGet;
    4. import org.apache.http.impl.client.CloseableHttpClient;
    5. import org.apache.http.impl.client.HttpClients;
    6. import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
    7. import org.apache.http.util.EntityUtils;
    8. import java.io.IOException;
    9. /**
    10. * @Author lpc
    11. * @Date 2024 03 14 09 38
    12. **/
    13. public class Test {
    14. public static void main(String[] args) {
    15. //创建连接池
    16. PoolingHttpClientConnectionManager cm = new PoolingHttpClientConnectionManager();
    17. //设置最大连接数
    18. cm.setMaxTotal(100);
    19. //设置每个主机的最大连接数
    20. cm.setDefaultMaxPerRoute(10);
    21. //使用连接池管理器发起请求
    22. doGet(cm);
    23. }
    24. public static void doGet(PoolingHttpClientConnectionManager cm){
    25. //不是每次创建新的httpClient,而是从连接池中获取HttpClient对象
    26. CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(cm).build();
    27. HttpGet httpGet = new HttpGet("http://www.itcast.cn");
    28. CloseableHttpResponse response=null;
    29. try {
    30. response = httpClient.execute(httpGet);
    31. if (response.getStatusLine().getStatusCode()==200){
    32. String content = EntityUtils.toString(response.getEntity(), "utf8");
    33. System.out.println(content.length());
    34. }
    35. } catch (IOException e) {
    36. throw new RuntimeException(e);
    37. }finally {
    38. if (response!=null){
    39. try {
    40. response.close();
    41. } catch (IOException e) {
    42. throw new RuntimeException(e);
    43. }
    44. //不能关闭,由连接池管理
    45. // httpClient.close();
    46. }
    47. }
    48. }
    49. }

    3.HttpClient---Get带参数

    1. package org.example;
    2. import org.apache.http.HttpEntity;
    3. import org.apache.http.client.methods.CloseableHttpResponse;
    4. import org.apache.http.client.methods.HttpGet;
    5. import org.apache.http.client.utils.URIBuilder;
    6. import org.apache.http.impl.client.CloseableHttpClient;
    7. import org.apache.http.impl.client.HttpClients;
    8. import org.apache.http.util.EntityUtils;
    9. import java.io.IOException;
    10. import java.net.URISyntaxException;
    11. /**
    12. * @Author lpc
    13. * @Date 2024 03 13 20 44
    14. **/
    15. public class Test2 {
    16. public static void main(String[] args) throws Exception {
    17. //1.打开浏览器
    18. CloseableHttpClient httpClient = HttpClients.createDefault();
    19. //设置请求地址是: http://yun.itheima.com/search?keys=Java
    20. //带参数的get方法设置
    21. //创建URIBuilder
    22. URIBuilder uriBuilder = new URIBuilder("http://yun.itheima.com/search");
    23. //设置参数 可以设置多个
    24. uriBuilder.setParameter("keys","Java");
    25. //2.输入网址,发起get请求创建HttpGet对象
    26. HttpGet httpGet = new HttpGet(uriBuilder.build());
    27. System.out.println("发起请求的信息"+httpGet);
    28. //3.
    29. CloseableHttpResponse response=null;
    30. try {
    31. response = httpClient.execute(httpGet);
    32. if (response.getStatusLine().getStatusCode()==200){
    33. HttpEntity httpEntity = response.getEntity();
    34. //
    35. String s = EntityUtils.toString(httpEntity, "utf8");
    36. System.out.println(s);
    37. }
    38. } catch (IOException e) {
    39. throw new RuntimeException(e);
    40. }finally {
    41. try {
    42. response.close();
    43. } catch (IOException e) {
    44. throw new RuntimeException(e);
    45. }
    46. try {
    47. httpClient.close();
    48. } catch (IOException e) {
    49. throw new RuntimeException(e);
    50. }
    51. }
    52. }
    53. }

    3.1 修改成连接池

    4.HttpClient---Post

    1. package org.example;
    2. import org.apache.http.HttpEntity;
    3. import org.apache.http.client.methods.CloseableHttpResponse;
    4. import org.apache.http.client.methods.HttpGet;
    5. import org.apache.http.client.methods.HttpPost;
    6. import org.apache.http.client.utils.URIBuilder;
    7. import org.apache.http.impl.client.CloseableHttpClient;
    8. import org.apache.http.impl.client.HttpClients;
    9. import org.apache.http.util.EntityUtils;
    10. import java.io.IOException;
    11. /**
    12. * @Author lpc
    13. * @Date 2024 03 13 20 59
    14. **/
    15. public class Post {
    16. public static void main(String[] args) {
    17. //1.打开浏览器
    18. CloseableHttpClient httpClient = HttpClients.createDefault();
    19. //2.输入网址,发起get请求创建HttpGet对象
    20. //HttpGet httpGet = new HttpGet("https://www.itcast.cn/");
    21. HttpPost httpPost = new HttpPost("https://www.itcast.cn/");
    22. //3.
    23. CloseableHttpResponse response=null;
    24. try {
    25. // response = httpClient.execute(httpGet);
    26. response = httpClient.execute(httpPost);
    27. if (response.getStatusLine().getStatusCode()==200){
    28. HttpEntity httpEntity = response.getEntity();
    29. //
    30. String s = EntityUtils.toString(httpEntity, "utf8");
    31. System.out.println(s);
    32. }
    33. } catch (IOException e) {
    34. throw new RuntimeException(e);
    35. }finally {
    36. try {
    37. response.close();
    38. } catch (IOException e) {
    39. throw new RuntimeException(e);
    40. }
    41. try {
    42. httpClient.close();
    43. } catch (IOException e) {
    44. throw new RuntimeException(e);
    45. }
    46. }
    47. }
    48. }

    4.1 修改成连接池

    1. package org.example;
    2. import org.apache.http.client.methods.CloseableHttpResponse;
    3. import org.apache.http.client.methods.HttpPost;
    4. import org.apache.http.impl.client.CloseableHttpClient;
    5. import org.apache.http.impl.client.HttpClients;
    6. import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
    7. import org.apache.http.util.EntityUtils;
    8. import java.io.IOException;
    9. /**
    10. * @Author lpc
    11. * @Date 2024 03 14 10 02
    12. **/
    13. public class Postl {
    14. public static void main(String[] args){
    15. //创建连接池管理器
    16. PoolingHttpClientConnectionManager cm = new PoolingHttpClientConnectionManager();
    17. //设置最大连接数
    18. cm.setMaxTotal(100);
    19. //设置每个主机最大连接数
    20. cm.setDefaultMaxPerRoute(10);
    21. //发起请求
    22. doPost(cm);
    23. }
    24. private static void doPost(PoolingHttpClientConnectionManager cm) {
    25. //不是每次创建新的httpClient,而是从连接池中获取HttpClient对象
    26. CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(cm).build();
    27. //2.输入网址 发起Post请求
    28. HttpPost httpPost = new HttpPost("http://yun.itheima.com/search");
    29. CloseableHttpResponse response=null;
    30. try {
    31. response = httpClient.execute(httpPost);
    32. if (response.getStatusLine().getStatusCode()==200){
    33. String s = EntityUtils.toString(response.getEntity());
    34. System.out.println(s.length());
    35. }
    36. } catch (IOException e) {
    37. throw new RuntimeException(e);
    38. }finally {
    39. if (response!=null){
    40. try {
    41. response.close();
    42. } catch (IOException e) {
    43. throw new RuntimeException(e);
    44. }
    45. }
    46. //不用关闭,由连接池管理
    47. // httpClient.close();
    48. }
    49. }
    50. }

    5.HttpClient---Post带参数

    1. package org.example;
    2. import org.apache.http.HttpEntity;
    3. import org.apache.http.NameValuePair;
    4. import org.apache.http.client.entity.UrlEncodedFormEntity;
    5. import org.apache.http.client.methods.CloseableHttpResponse;
    6. import org.apache.http.client.methods.HttpGet;
    7. import org.apache.http.client.methods.HttpPost;
    8. import org.apache.http.client.utils.URIBuilder;
    9. import org.apache.http.impl.client.CloseableHttpClient;
    10. import org.apache.http.impl.client.HttpClients;
    11. import org.apache.http.message.BasicNameValuePair;
    12. import org.apache.http.util.EntityUtils;
    13. import java.io.IOException;
    14. import java.io.UnsupportedEncodingException;
    15. import java.util.ArrayList;
    16. import java.util.List;
    17. /**
    18. * @Author lpc
    19. * @Date 2024 03 13 20 59
    20. **/
    21. public class Post {
    22. public static void main(String[] args) throws Exception {
    23. //1.打开浏览器
    24. CloseableHttpClient httpClient = HttpClients.createDefault();
    25. //2.输入网址 发起Post请求
    26. HttpPost httpPost = new HttpPost("http://yun.itheima.com/search");
    27. //声明List集合,封装表单中的参数
    28. List params =new ArrayList();
    29. //设置请求地址是: http://yun.itheima.com/search?keys=Java
    30. params.add(new BasicNameValuePair("keys","Java"));
    31. //创建表单的Entity对象,第一个参数就是封装的表单数据,第二个参数就是编码
    32. UrlEncodedFormEntity urlEncodedFormEntity = new UrlEncodedFormEntity(params,"utf8");
    33. //设置表单的Entity对象到Post请求中
    34. httpPost.setEntity(urlEncodedFormEntity);
    35. CloseableHttpResponse response=null;
    36. try {
    37. // response = httpClient.execute(httpGet);
    38. response = httpClient.execute(httpPost);
    39. if (response.getStatusLine().getStatusCode()==200){
    40. HttpEntity httpEntity = response.getEntity();
    41. //
    42. String s = EntityUtils.toString(httpEntity, "utf8");
    43. System.out.println(s);
    44. }
    45. } catch (IOException e) {
    46. throw new RuntimeException(e);
    47. }finally {
    48. try {
    49. response.close();
    50. } catch (IOException e) {
    51. throw new RuntimeException(e);
    52. }
    53. try {
    54. httpClient.close();
    55. } catch (IOException e) {
    56. throw new RuntimeException(e);
    57. }
    58. }
    59. }
    60. }

    6.HttpClient-连接池

    如果每次请求都要创建HttpClient,会有频繁创建和销毁的问题,可以使用连接池来解决这个问题。
    测试以下代码,并通过断点查看每次获取的HttpClient都是不一样的。

    1. package org.example;
    2. import org.apache.http.client.methods.CloseableHttpResponse;
    3. import org.apache.http.client.methods.HttpGet;
    4. import org.apache.http.impl.client.CloseableHttpClient;
    5. import org.apache.http.impl.client.HttpClients;
    6. import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
    7. import org.apache.http.util.EntityUtils;
    8. import java.io.IOException;
    9. /**
    10. * @Author lpc
    11. * @Date 2024 03 14 09 38
    12. **/
    13. public class Test {
    14. public static void main(String[] args) {
    15. //创建连接池
    16. PoolingHttpClientConnectionManager cm = new PoolingHttpClientConnectionManager();
    17. //设置最大连接数
    18. cm.setMaxTotal(100);
    19. //设置每个主机的最大连接数
    20. cm.setDefaultMaxPerRoute(10);
    21. //使用连接池管理器发起请求
    22. doGet(cm);
    23. }
    24. public static void doGet(PoolingHttpClientConnectionManager cm){
    25. //不是每次创建新的httpClient,而是从连接池中获取HttpClient对象
    26. CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(cm).build();
    27. HttpGet httpGet = new HttpGet("http://www.itcast.cn");
    28. CloseableHttpResponse response=null;
    29. try {
    30. response = httpClient.execute(httpGet);
    31. if (response.getStatusLine().getStatusCode()==200){
    32. String content = EntityUtils.toString(response.getEntity(), "utf8");
    33. System.out.println(content.length());
    34. }
    35. } catch (IOException e) {
    36. throw new RuntimeException(e);
    37. }finally {
    38. if (response!=null){
    39. try {
    40. response.close();
    41. } catch (IOException e) {
    42. throw new RuntimeException(e);
    43. }
    44. //不能关闭,由连接池管理
    45. // httpClient.close();
    46. }
    47. }
    48. }
    49. }

    7.设置请求信息
     

    1. package org.example;
    2. import org.apache.http.client.config.RequestConfig;
    3. import org.apache.http.client.methods.CloseableHttpResponse;
    4. import org.apache.http.client.methods.HttpGet;
    5. import org.apache.http.impl.client.CloseableHttpClient;
    6. import org.apache.http.impl.client.HttpClients;
    7. import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
    8. import org.apache.http.util.EntityUtils;
    9. import java.io.IOException;
    10. /**
    11. * @Author lpc
    12. * @Date 2024 03 14 09 38
    13. **/
    14. public class Test {
    15. public static void main(String[] args) {
    16. //创建连接池
    17. PoolingHttpClientConnectionManager cm = new PoolingHttpClientConnectionManager();
    18. //设置最大连接数
    19. cm.setMaxTotal(100);
    20. //设置每个主机的最大连接数
    21. cm.setDefaultMaxPerRoute(10);
    22. //使用连接池管理器发起请求
    23. doGet(cm);
    24. }
    25. public static void doGet(PoolingHttpClientConnectionManager cm){
    26. //不是每次创建新的httpClient,而是从连接池中获取HttpClient对象
    27. CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(cm).build();
    28. HttpGet httpGet = new HttpGet("http://www.itcast.cn");
    29. //配置请求信息
    30. RequestConfig config=RequestConfig.custom().setConnectTimeout(1000) //创建连接的最长时间,单位是毫秒
    31. .setConnectionRequestTimeout(500)//设置获取连接的最长时间
    32. .setSocketTimeout(10*1000)//设置数据传输的最长时间
    33. .build();
    34. //给请求设置请求信息
    35. httpGet.setConfig(config);
    36. CloseableHttpResponse response=null;
    37. try {
    38. response = httpClient.execute(httpGet);
    39. if (response.getStatusLine().getStatusCode()==200){
    40. String content = EntityUtils.toString(response.getEntity(), "utf8");
    41. System.out.println(content.length());
    42. }
    43. } catch (IOException e) {
    44. throw new RuntimeException(e);
    45. }finally {
    46. if (response!=null){
    47. try {
    48. response.close();
    49. } catch (IOException e) {
    50. throw new RuntimeException(e);
    51. }
    52. //不能关闭,由连接池管理
    53. // httpClient.close();
    54. }
    55. }
    56. }
    57. }

    8.jsoup介绍.

    jsoup是一款Java 的 HTML解析器,可直接解析某个URL地址、HTML文本内容。它提供了一套非常省力的API,可通过DOM,CSS以及类似于jQuery的操作方法来取出和操作数据。

    jsoup 的主要功能如下:

    1.从一个 URL,文件或字符串中解析HTML;

    2.使用DOM或CSS选择器来查找、取出数据;

    3.可操作HTML元素、属性、文本;

    依赖

    <dependency>
        <groupId>org.jsoup</groupId>
        <artifactId>jsoup</artifactId>
        <version>1.13.1</version>
    </dependency>
    <dependency>
        <groupId>junit</groupId>
        <artifactId>junit</artifactId>
        <version>4.12</version>
        <scope>test</scope>
    </dependency>
    <dependency>
        <groupId>commons-io</groupId>
        <artifactId>commons-io</artifactId>
        <version>2.4</version>
    </dependency>
    <dependency>
        <groupId>org.apache.commons</groupId>
        <artifactId>commons-lang3</artifactId>
        <version>3.8.1</version>
    </dependency>

    9.jsoup解析url

    1. package jsoup;
    2. import org.jsoup.Jsoup;
    3. import org.jsoup.nodes.Document;
    4. import org.jsoup.nodes.Element;
    5. import org.junit.Test;
    6. import java.net.MalformedURLException;
    7. import java.net.URL;
    8. /**
    9. * @Author lpc
    10. * @Date 2024 03 14 10 44
    11. **/
    12. public class jsoupTestFirst {
    13. @Test
    14. public void testJsoupUrl() throws Exception {
    15. //解析URL地址
    16. Document parse = Jsoup.parse(new URL("http://www.itcast.cn"), 10*1000);
    17. //获取title的内容
    18. Element title = parse.getElementsByTag("title").first();
    19. System.out.println(title.text());
    20. }
    21. }

    10.jsoup解析字符串

    1. package jsoup;
    2. import org.apache.commons.io.FileUtils;
    3. import org.jsoup.Jsoup;
    4. import org.jsoup.nodes.Document;
    5. import org.jsoup.nodes.Element;
    6. import org.junit.Test;
    7. import java.io.File;
    8. import java.net.MalformedURLException;
    9. import java.net.URL;
    10. /**
    11. * @Author lpc
    12. * @Date 2024 03 14 10 44
    13. **/
    14. public class jsoupTestFirst {
    15. @Test
    16. public void testString() throws Exception {
    17. //使用工具读取文件,获取字符串
    18. String file = FileUtils.readFileToString(new File("D:\\file.html"), "utf8");
    19. //解析字符串
    20. Document document = Jsoup.parse(file);
    21. //获取title的内容
    22. String title = document.getElementsByTag("title").first().text();
    23. System.out.println(title);
    24. }
    25. }

    11.jsoup解析文件

    1. @Test
    2. public void testFile() throws Exception {
    3. //解析文件
    4. Document parse = Jsoup.parse(new File("D:\\file.html"), "utf8");
    5. String title = parse.getElementsByTag("title").first().text();
    6. System.out.println(title);
    7. }

    12.所有dom方式获取元素

    1. @Test
    2. public void testDom() throws Exception {
    3. //解析文件,获取Document对象
    4. Document parse = Jsoup.parse(new File("D:\\file.html"), "utf8");
    5. //获取元素
    6. //1.
    7. //Element elementById = parse.getElementById("popupMenu");
    8. //2.
    9. //Element elementById=parse.getElementsByTag("span").first();
    10. //3.
    11. // Elements elementById = parse.getElementsByClass("city_nav");
    12. //4.
    13. Elements elementById=parse.getElementsByAttribute("abc");
    14. System.out.println(elementById.text());
    15. }

    13.元素中获取数据

    1. @Test
    2. public void testData() throws Exception {
    3. //解析文件
    4. Document parse = Jsoup.parse(new File("D:\\file.html"), "utf8");
    5. //根据id获取元素
    6. Element elementById = parse.getElementById("test");
    7. System.out.println(elementById);
    8. //1.从元素中获取id
    9. String str1=elementById.id();
    10. System.out.println(str1);
    11. //2.从元素中获取className
    12. String str2=elementById.className();
    13. System.out.println(str2);
    14. //3.从元素获取attr的值
    15. String str3=elementById.attr("id");
    16. System.out.println(str3);
    17. //4。从元素中获取所有属性
    18. Attributes attributes = elementById.attributes();
    19. System.out.println(attributes);
    20. //5.从元素中获取文本内容
    21. String str4=elementById.text();
    22. System.out.println(str4);
    23. }

  • 相关阅读:
    前端培训丁鹿学堂:vue3中setup语法糖特性写法总结
    java毕业设计游戏社区设计Mybatis+系统+数据库+调试部署
    数组累加和问题
    华为数通HCIA-地址分类及子网划分
    YOLOV7详细解读(一)网络架构解读
    在Springboot HandlerInterceptor中获取GET和POST请求参数
    算法基础实验OJ—树的遍历
    一站式元数据治理平台——Datahub
    如何禁止在堆上和栈上创建对象
    时空智友企业流程化管控系统任意文件上传漏洞复现【附POC】
  • 原文地址:https://blog.csdn.net/m0_59690068/article/details/136643758