目录
导入依赖
- <dependency>
- <groupId>org.apache.httpcomponents</groupId>
- <artifactId>httpclient</artifactId>
- <version>4.5.2</version>
- </dependency>
- logging:
- level:
- root: info
- com.lrm: debug
- package com.itheima.reggie.utils;
-
-
- import org.apache.http.HttpEntity;
- import org.apache.http.client.methods.CloseableHttpResponse;
- import org.apache.http.client.methods.HttpGet;
- import org.apache.http.impl.client.CloseableHttpClient;
- import org.apache.http.impl.client.HttpClients;
- import org.apache.http.util.EntityUtils;
-
- import java.io.IOException;
-
- /**
- * @Author lpc
- **/
- public class CrawlerFirst {
- public static void main(String[] args) throws Exception {
-
- //1.打开浏览器,创建Httpclient对象
- CloseableHttpClient httpClient = HttpClients.createDefault();
-
- //2.输入网址,发起get请求创建HttpGet对象
- HttpGet httpGet = new HttpGet("https://www.itcast.cn/");
-
- //3.按回车,发起请求,返回响应,使用Httpclient对象发起请求
- CloseableHttpResponse response = httpClient.execute(httpGet);
- //4.解析响应,获取数据
- //判斯状态码是否是200
- if (response.getStatusLine().getStatusCode()==200){
- HttpEntity httpEntity = response.getEntity();
- //获取前端静态页面
- String content = EntityUtils.toString(httpEntity,"utf8");
- System.out.println(content);
- }
-
-
- }
- }
- package com.itheima.reggie.utils;
-
-
- import org.apache.http.HttpEntity;
- import org.apache.http.client.methods.CloseableHttpResponse;
- import org.apache.http.client.methods.HttpGet;
- import org.apache.http.impl.client.CloseableHttpClient;
- import org.apache.http.impl.client.HttpClients;
- import org.apache.http.util.EntityUtils;
-
- import java.io.IOException;
-
- /**
- * @Author lpc
- * @Date 2024 03 12 00 23
- **/
- public class CrawlerFirst {
- public static void main(String[] args){
-
- //1.打开浏览器,创建Httpclient对象
- CloseableHttpClient httpClient = HttpClients.createDefault();
-
- //2.输入网址,发起get请求创建HttpGet对象
- HttpGet httpGet = new HttpGet("https://www.itcast.cn/");
-
- //3.按回车,发起请求,返回响应,使用Httpclient对象发起请求
- CloseableHttpResponse response = null;
-
- try {
- response = httpClient.execute(httpGet);
- //4.解析响应,获取数据
- //判斯状态码是否是200
- if (response.getStatusLine().getStatusCode()==200){
- HttpEntity httpEntity = response.getEntity();
- //获取前端静态页面
- String content = EntityUtils.toString(httpEntity,"utf8");
- System.out.println(content.length());
- }
- } catch (IOException e) {
- throw new RuntimeException(e);
- }finally {
- try {
- //关闭response
- response.close();
- } catch (IOException e) {
- throw new RuntimeException(e);
- }
- try {
- //关闭浏览器
- httpClient.close();
- } catch (IOException e) {
- throw new RuntimeException(e);
- }
- }
-
-
- }
- }
- package org.example;
-
- import org.apache.http.client.methods.CloseableHttpResponse;
- import org.apache.http.client.methods.HttpGet;
- import org.apache.http.impl.client.CloseableHttpClient;
- import org.apache.http.impl.client.HttpClients;
- import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
- import org.apache.http.util.EntityUtils;
-
- import java.io.IOException;
-
- /**
- * @Author lpc
- * @Date 2024 03 14 09 38
- **/
- public class Test {
-
- public static void main(String[] args) {
- //创建连接池
- PoolingHttpClientConnectionManager cm = new PoolingHttpClientConnectionManager();
- //设置最大连接数
- cm.setMaxTotal(100);
- //设置每个主机的最大连接数
- cm.setDefaultMaxPerRoute(10);
- //使用连接池管理器发起请求
- doGet(cm);
- }
-
- public static void doGet(PoolingHttpClientConnectionManager cm){
- //不是每次创建新的httpClient,而是从连接池中获取HttpClient对象
- CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(cm).build();
- HttpGet httpGet = new HttpGet("http://www.itcast.cn");
- CloseableHttpResponse response=null;
- try {
- response = httpClient.execute(httpGet);
- if (response.getStatusLine().getStatusCode()==200){
- String content = EntityUtils.toString(response.getEntity(), "utf8");
- System.out.println(content.length());
- }
- } catch (IOException e) {
- throw new RuntimeException(e);
- }finally {
- if (response!=null){
- try {
- response.close();
- } catch (IOException e) {
- throw new RuntimeException(e);
- }
- //不能关闭,由连接池管理
- // httpClient.close();
- }
- }
-
- }
- }
- package org.example;
-
- import org.apache.http.HttpEntity;
- import org.apache.http.client.methods.CloseableHttpResponse;
- import org.apache.http.client.methods.HttpGet;
- import org.apache.http.client.utils.URIBuilder;
- import org.apache.http.impl.client.CloseableHttpClient;
- import org.apache.http.impl.client.HttpClients;
- import org.apache.http.util.EntityUtils;
-
- import java.io.IOException;
- import java.net.URISyntaxException;
-
- /**
- * @Author lpc
- * @Date 2024 03 13 20 44
- **/
- public class Test2 {
- public static void main(String[] args) throws Exception {
-
- //1.打开浏览器
- CloseableHttpClient httpClient = HttpClients.createDefault();
- //设置请求地址是: http://yun.itheima.com/search?keys=Java
- //带参数的get方法设置
- //创建URIBuilder
- URIBuilder uriBuilder = new URIBuilder("http://yun.itheima.com/search");
- //设置参数 可以设置多个
- uriBuilder.setParameter("keys","Java");
-
- //2.输入网址,发起get请求创建HttpGet对象
- HttpGet httpGet = new HttpGet(uriBuilder.build());
- System.out.println("发起请求的信息"+httpGet);
- //3.
- CloseableHttpResponse response=null;
- try {
- response = httpClient.execute(httpGet);
- if (response.getStatusLine().getStatusCode()==200){
- HttpEntity httpEntity = response.getEntity();
- //
- String s = EntityUtils.toString(httpEntity, "utf8");
- System.out.println(s);
- }
- } catch (IOException e) {
- throw new RuntimeException(e);
- }finally {
- try {
- response.close();
- } catch (IOException e) {
- throw new RuntimeException(e);
- }
- try {
- httpClient.close();
- } catch (IOException e) {
- throw new RuntimeException(e);
- }
- }
- }
- }
- package org.example;
-
- import org.apache.http.HttpEntity;
- import org.apache.http.client.methods.CloseableHttpResponse;
- import org.apache.http.client.methods.HttpGet;
- import org.apache.http.client.methods.HttpPost;
- import org.apache.http.client.utils.URIBuilder;
- import org.apache.http.impl.client.CloseableHttpClient;
- import org.apache.http.impl.client.HttpClients;
- import org.apache.http.util.EntityUtils;
-
- import java.io.IOException;
-
- /**
- * @Author lpc
- * @Date 2024 03 13 20 59
- **/
- public class Post {
- public static void main(String[] args) {
-
- //1.打开浏览器
- CloseableHttpClient httpClient = HttpClients.createDefault();
- //2.输入网址,发起get请求创建HttpGet对象
- //HttpGet httpGet = new HttpGet("https://www.itcast.cn/");
- HttpPost httpPost = new HttpPost("https://www.itcast.cn/");
- //3.
- CloseableHttpResponse response=null;
- try {
- // response = httpClient.execute(httpGet);
- response = httpClient.execute(httpPost);
- if (response.getStatusLine().getStatusCode()==200){
- HttpEntity httpEntity = response.getEntity();
- //
- String s = EntityUtils.toString(httpEntity, "utf8");
- System.out.println(s);
- }
- } catch (IOException e) {
- throw new RuntimeException(e);
- }finally {
- try {
- response.close();
- } catch (IOException e) {
- throw new RuntimeException(e);
- }
- try {
- httpClient.close();
- } catch (IOException e) {
- throw new RuntimeException(e);
- }
- }
- }
- }
- package org.example;
-
- import org.apache.http.client.methods.CloseableHttpResponse;
- import org.apache.http.client.methods.HttpPost;
- import org.apache.http.impl.client.CloseableHttpClient;
- import org.apache.http.impl.client.HttpClients;
- import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
- import org.apache.http.util.EntityUtils;
-
- import java.io.IOException;
-
- /**
- * @Author lpc
- * @Date 2024 03 14 10 02
- **/
- public class Postl {
- public static void main(String[] args){
-
- //创建连接池管理器
- PoolingHttpClientConnectionManager cm = new PoolingHttpClientConnectionManager();
- //设置最大连接数
- cm.setMaxTotal(100);
- //设置每个主机最大连接数
- cm.setDefaultMaxPerRoute(10);
- //发起请求
- doPost(cm);
- }
-
- private static void doPost(PoolingHttpClientConnectionManager cm) {
- //不是每次创建新的httpClient,而是从连接池中获取HttpClient对象
- CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(cm).build();
- //2.输入网址 发起Post请求
- HttpPost httpPost = new HttpPost("http://yun.itheima.com/search");
- CloseableHttpResponse response=null;
- try {
- response = httpClient.execute(httpPost);
- if (response.getStatusLine().getStatusCode()==200){
- String s = EntityUtils.toString(response.getEntity());
- System.out.println(s.length());
- }
- } catch (IOException e) {
- throw new RuntimeException(e);
- }finally {
- if (response!=null){
- try {
- response.close();
- } catch (IOException e) {
- throw new RuntimeException(e);
- }
-
- }
- //不用关闭,由连接池管理
- // httpClient.close();
- }
- }
- }
- package org.example;
-
- import org.apache.http.HttpEntity;
- import org.apache.http.NameValuePair;
- import org.apache.http.client.entity.UrlEncodedFormEntity;
- import org.apache.http.client.methods.CloseableHttpResponse;
- import org.apache.http.client.methods.HttpGet;
- import org.apache.http.client.methods.HttpPost;
- import org.apache.http.client.utils.URIBuilder;
- import org.apache.http.impl.client.CloseableHttpClient;
- import org.apache.http.impl.client.HttpClients;
- import org.apache.http.message.BasicNameValuePair;
- import org.apache.http.util.EntityUtils;
-
- import java.io.IOException;
- import java.io.UnsupportedEncodingException;
- import java.util.ArrayList;
- import java.util.List;
-
- /**
- * @Author lpc
- * @Date 2024 03 13 20 59
- **/
- public class Post {
- public static void main(String[] args) throws Exception {
-
- //1.打开浏览器
- CloseableHttpClient httpClient = HttpClients.createDefault();
- //2.输入网址 发起Post请求
- HttpPost httpPost = new HttpPost("http://yun.itheima.com/search");
- //声明List集合,封装表单中的参数
- List
params =new ArrayList(); - //设置请求地址是: http://yun.itheima.com/search?keys=Java
- params.add(new BasicNameValuePair("keys","Java"));
- //创建表单的Entity对象,第一个参数就是封装的表单数据,第二个参数就是编码
- UrlEncodedFormEntity urlEncodedFormEntity = new UrlEncodedFormEntity(params,"utf8");
-
- //设置表单的Entity对象到Post请求中
- httpPost.setEntity(urlEncodedFormEntity);
-
- CloseableHttpResponse response=null;
- try {
- // response = httpClient.execute(httpGet);
- response = httpClient.execute(httpPost);
- if (response.getStatusLine().getStatusCode()==200){
- HttpEntity httpEntity = response.getEntity();
- //
- String s = EntityUtils.toString(httpEntity, "utf8");
- System.out.println(s);
- }
- } catch (IOException e) {
- throw new RuntimeException(e);
- }finally {
- try {
- response.close();
- } catch (IOException e) {
- throw new RuntimeException(e);
- }
- try {
- httpClient.close();
- } catch (IOException e) {
- throw new RuntimeException(e);
- }
- }
- }
- }
如果每次请求都要创建HttpClient,会有频繁创建和销毁的问题,可以使用连接池来解决这个问题。
测试以下代码,并断点查看每次获取的HttpClient都是不一样的。
- package org.example;
-
- import org.apache.http.client.methods.CloseableHttpResponse;
- import org.apache.http.client.methods.HttpGet;
- import org.apache.http.impl.client.CloseableHttpClient;
- import org.apache.http.impl.client.HttpClients;
- import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
- import org.apache.http.util.EntityUtils;
-
- import java.io.IOException;
-
- /**
- * @Author lpc
- * @Date 2024 03 14 09 38
- **/
- public class Test {
-
- public static void main(String[] args) {
- //创建连接池
- PoolingHttpClientConnectionManager cm = new PoolingHttpClientConnectionManager();
- //设置最大连接数
- cm.setMaxTotal(100);
- //设置每个主机的最大连接数
- cm.setDefaultMaxPerRoute(10);
- //使用连接池管理器发起请求
- doGet(cm);
- }
-
- public static void doGet(PoolingHttpClientConnectionManager cm){
- //不是每次创建新的httpClient,而是从连接池中获取HttpClient对象
- CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(cm).build();
- HttpGet httpGet = new HttpGet("http://www.itcast.cn");
- CloseableHttpResponse response=null;
- try {
- response = httpClient.execute(httpGet);
- if (response.getStatusLine().getStatusCode()==200){
- String content = EntityUtils.toString(response.getEntity(), "utf8");
- System.out.println(content.length());
- }
- } catch (IOException e) {
- throw new RuntimeException(e);
- }finally {
- if (response!=null){
- try {
- response.close();
- } catch (IOException e) {
- throw new RuntimeException(e);
- }
- //不能关闭,由连接池管理
- // httpClient.close();
- }
- }
-
- }
- }
- package org.example;
-
- import org.apache.http.client.config.RequestConfig;
- import org.apache.http.client.methods.CloseableHttpResponse;
- import org.apache.http.client.methods.HttpGet;
- import org.apache.http.impl.client.CloseableHttpClient;
- import org.apache.http.impl.client.HttpClients;
- import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
- import org.apache.http.util.EntityUtils;
-
- import java.io.IOException;
-
- /**
- * @Author lpc
- * @Date 2024 03 14 09 38
- **/
- public class Test {
-
- public static void main(String[] args) {
- //创建连接池
- PoolingHttpClientConnectionManager cm = new PoolingHttpClientConnectionManager();
- //设置最大连接数
- cm.setMaxTotal(100);
- //设置每个主机的最大连接数
- cm.setDefaultMaxPerRoute(10);
- //使用连接池管理器发起请求
- doGet(cm);
- }
-
- public static void doGet(PoolingHttpClientConnectionManager cm){
- //不是每次创建新的httpClient,而是从连接池中获取HttpClient对象
- CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(cm).build();
- HttpGet httpGet = new HttpGet("http://www.itcast.cn");
- //配置请求信息
- RequestConfig config=RequestConfig.custom().setConnectTimeout(1000) //创建连接的最长时间,单位是毫秒
- .setConnectionRequestTimeout(500)//设置获取连接的最长时间
- .setSocketTimeout(10*1000)//设置数据传输的最长时间
- .build();
- //给请求设置请求信息
- httpGet.setConfig(config);
- CloseableHttpResponse response=null;
- try {
- response = httpClient.execute(httpGet);
- if (response.getStatusLine().getStatusCode()==200){
- String content = EntityUtils.toString(response.getEntity(), "utf8");
- System.out.println(content.length());
- }
- } catch (IOException e) {
- throw new RuntimeException(e);
- }finally {
- if (response!=null){
- try {
- response.close();
- } catch (IOException e) {
- throw new RuntimeException(e);
- }
- //不能关闭,由连接池管理
- // httpClient.close();
- }
- }
-
- }
- }
jsoup是一款Java 的 HTML解析器,可直接解析某个URL地址、HTML文本内容。它提供了一套非常省力的API,可通过DOM,CSS以及类似于jQuery的操作方法来取出和操作数据。
jsoup 的主要功能如下:
1.从一个 URL,文件或字符串中解析HTML;
2.使用DOM或CSS选择器来查找、取出数据;
3.可操作HTML元素、属性、文本;
依赖
- <dependency>
- <groupId>org.jsoup</groupId>
- <artifactId>jsoup</artifactId>
- <version>1.13.1</version>
- </dependency>
-
- <dependency>
- <groupId>junit</groupId>
- <artifactId>junit</artifactId>
- <version>4.12</version>
- <scope>test</scope>
- </dependency>
-
-
- <dependency>
- <groupId>commons-io</groupId>
- <artifactId>commons-io</artifactId>
- <version>2.4</version>
- </dependency>
-
-
- <dependency>
- <groupId>org.apache.commons</groupId>
- <artifactId>commons-lang3</artifactId>
- <version>3.8.1</version>
- </dependency>
- package jsoup;
-
- import org.jsoup.Jsoup;
- import org.jsoup.nodes.Document;
- import org.jsoup.nodes.Element;
- import org.junit.Test;
-
- import java.net.MalformedURLException;
- import java.net.URL;
-
- /**
- * @Author lpc
- * @Date 2024 03 14 10 44
- **/
- public class jsoupTestFirst {
-
- @Test
- public void testJsoupUrl() throws Exception {
- //解析URL地址
- Document parse = Jsoup.parse(new URL("http://www.itcast.cn"), 10*1000);
-
- //获取title的内容
- Element title = parse.getElementsByTag("title").first();
- System.out.println(title.text());
-
- }
-
-
-
- }
- package jsoup;
-
- import org.apache.commons.io.FileUtils;
- import org.jsoup.Jsoup;
- import org.jsoup.nodes.Document;
- import org.jsoup.nodes.Element;
- import org.junit.Test;
-
- import java.io.File;
- import java.net.MalformedURLException;
- import java.net.URL;
-
- /**
- * @Author lpc
- * @Date 2024 03 14 10 44
- **/
- public class jsoupTestFirst {
-
- @Test
- public void testString() throws Exception {
- //使用工具读取文件,获取字符串
- String file = FileUtils.readFileToString(new File("D:\\file.html"), "utf8");
- //解析字符串
- Document document = Jsoup.parse(file);
- //获取title的内容
- String title = document.getElementsByTag("title").first().text();
- System.out.println(title);
-
- }
-
-
-
- }
- @Test
- public void testFile() throws Exception {
- //解析文件
- Document parse = Jsoup.parse(new File("D:\\file.html"), "utf8");
- String title = parse.getElementsByTag("title").first().text();
- System.out.println(title);
-
- }

- @Test
- public void testDom() throws Exception {
- //解析文件,获取Document对象
- Document parse = Jsoup.parse(new File("D:\\file.html"), "utf8");
- //获取元素
- //1.
- //Element elementById = parse.getElementById("popupMenu");
- //2.
- //Element elementById=parse.getElementsByTag("span").first();
- //3.
- // Elements elementById = parse.getElementsByClass("city_nav");
- //4.
- Elements elementById=parse.getElementsByAttribute("abc");
-
- System.out.println(elementById.text());
-
- }


- @Test
- public void testData() throws Exception {
- //解析文件
- Document parse = Jsoup.parse(new File("D:\\file.html"), "utf8");
- //根据id获取元素
- Element elementById = parse.getElementById("test");
- System.out.println(elementById);
- //1.从元素中获取id
- String str1=elementById.id();
- System.out.println(str1);
- //2.从元素中获取className
- String str2=elementById.className();
- System.out.println(str2);
- //3.从元素获取attr的值
- String str3=elementById.attr("id");
- System.out.println(str3);
- //4。从元素中获取所有属性
- Attributes attributes = elementById.attributes();
- System.out.println(attributes);
- //5.从元素中获取文本内容
- String str4=elementById.text();
- System.out.println(str4);
-
- }