java word文档转 html文件

用java将word转为html文档

1、简介
2、添加依赖
3、代码示例

1、简介

最近因项目需要，要对word文档进行解析拆分。直接解析word有点麻烦，于是想到先将word转为html文档，再用jsoup解析html文件，这样更方便。jsoup的使用可以参考相关API，操作简单，类似JavaScript。
word转html文档方法如下：

一般word文件后缀有doc、docx两种。docx是office word 2007以及以后版本文档的扩展名；doc是office word 2003文档保存的扩展名。对于这两种格式的word转换成html需要使用不同的方法。

2、添加依赖

对于docx格式的文档使用xdocreport进行转换。依赖如下：

<dependency>
    <groupId>fr.opensagres.xdocreport</groupId>
    <artifactId>fr.opensagres.xdocreport.document</artifactId>
    <version>1.0.5</version>
</dependency>
<dependency>
    <groupId>fr.opensagres.xdocreport</groupId>
    <artifactId>org.apache.poi.xwpf.converter.xhtml</artifactId>
    <version>1.0.5</version>
</dependency>
1
2
3
4
5
6
7
8
9
10

对于doc格式的文档使用poi进行转换。依赖如下：

<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi</artifactId>
    <version>3.12</version>
</dependency>
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-scratchpad</artifactId>
    <version>3.12</version>
</dependency>
1
2
3
4
5
6
7
8
9
10

3、代码示例

package com.test.word;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;

import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;

import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.converter.PicturesManager;
import org.apache.poi.hwpf.converter.WordToHtmlConverter;
import org.apache.poi.hwpf.usermodel.PictureType;
import org.apache.poi.xwpf.converter.core.FileImageExtractor;
import org.apache.poi.xwpf.converter.core.FileURIResolver;
import org.apache.poi.xwpf.converter.xhtml.XHTMLConverter;
import org.apache.poi.xwpf.converter.xhtml.XHTMLOptions;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.junit.Test;
import org.w3c.dom.Document;

/**
 * word 转换成html
 */
public class WordToHtml {

    /**
     * 2007版本word转换成html
     * @throws IOException
     */
    @Test
    public void Word2007ToHtml() throws IOException {
        String filepath = "C:/test/";
        String fileName = "滕王阁序2007.docx";
        String htmlName = "滕王阁序2007.html";
        final String file = filepath + fileName;
        File f = new File(file);
        if (!f.exists()) {
            System.out.println("Sorry File does not Exists!");
        } else {
            if (f.getName().endsWith(".docx") || f.getName().endsWith(".DOCX")) {

                // 1) 加载word文档生成 XWPFDocument对象
                InputStream in = new FileInputStream(f);
                XWPFDocument document = new XWPFDocument(in);

                // 2) 解析 XHTML配置 (这里设置IURIResolver来设置图片存放的目录)
                File imageFolderFile = new File(filepath);
                XHTMLOptions options = XHTMLOptions.create().URIResolver(new FileURIResolver(imageFolderFile));
                options.setExtractor(new FileImageExtractor(imageFolderFile));
                options.setIgnoreStylesIfUnused(false);
                options.setFragment(true);

                // 3) 将 XWPFDocument转换成XHTML
                OutputStream out = new FileOutputStream(new File(filepath + htmlName));
                XHTMLConverter.getInstance().convert(document, out, options);

                //也可以使用字符数组流获取解析的内容
//                ByteArrayOutputStream baos = new ByteArrayOutputStream();
//                XHTMLConverter.getInstance().convert(document, baos, options);
//                String content = baos.toString();
//                System.out.println(content);
//                 baos.close();
            } else {
                System.out.println("Enter only MS Office 2007+ files");
            }
        }
    }

    /**
     * /**
     * 2003版本word转换成html
     * @throws IOException
     * @throws TransformerException
     * @throws ParserConfigurationException
     */
    @Test
    public void Word2003ToHtml() throws IOException, TransformerException, ParserConfigurationException {
        String filepath = "C:/test/";
        final String imagepath = "C:/test/image/";
        String fileName = "滕王阁序2003.doc";
        String htmlName = "滕王阁序2003.html";
        final String file = filepath + fileName;
        InputStream input = new FileInputStream(new File(file));
        HWPFDocument wordDocument = new HWPFDocument(input);
        WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument());
        //设置图片存放的位置
        wordToHtmlConverter.setPicturesManager(new PicturesManager() {
            public String savePicture(byte[] content, PictureType pictureType, String suggestedName, float widthInches, float heightInches) {
                File imgPath = new File(imagepath);
                if(!imgPath.exists()){//图片目录不存在则创建
                    imgPath.mkdirs();
                }
                File file = new File(imagepath + suggestedName);
                try {
                    OutputStream os = new FileOutputStream(file);
                    os.write(content);
                    os.close();
                } catch (FileNotFoundException e) {
                    e.printStackTrace();
                } catch (IOException e) {
                    e.printStackTrace();
                }
                return imagepath + suggestedName;
            }
        });

        //解析word文档
        wordToHtmlConverter.processDocument(wordDocument);
        Document htmlDocument = wordToHtmlConverter.getDocument();

        File htmlFile = new File(filepath + htmlName);
        OutputStream outStream = new FileOutputStream(htmlFile);

        //也可以使用字符数组流获取解析的内容
//        ByteArrayOutputStream baos = new ByteArrayOutputStream();
//        OutputStream outStream = new BufferedOutputStream(baos);

        DOMSource domSource = new DOMSource(htmlDocument);
        StreamResult streamResult = new StreamResult(outStream);

        TransformerFactory factory = TransformerFactory.newInstance();
        Transformer serializer = factory.newTransformer();
        serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8");
        serializer.setOutputProperty(OutputKeys.INDENT, "yes");
        serializer.setOutputProperty(OutputKeys.METHOD, "html");

        serializer.transform(domSource, streamResult);

        //也可以使用字符数组流获取解析的内容
//        String content = baos.toString();
//        System.out.println(content);
//        baos.close();
        outStream.close();
    }
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146

运行结果：
在这里插入图片描述

原文参考：https://www.cnblogs.com/always-online/p/4800131.html

相关阅读:
node js AES加密
 电脑案件冲突问题
 053基于web+springboot的宠物咖啡馆平台的设计与实现
 做项目管理需要哪些技能？
有人说考个PMP证两个星期搞定？
5.2 基于ROP漏洞挖掘与利用
 Vue+NodeJS+MongoDB实现邮箱验证注册、登录
 水库大坝安全监测预警系统的重要作用
 （王道考研计算机网络）第五章传输层-第三节1-2：TCP协议特点和TCP报文段
 java计算机毕业设计健康医疗预约系统源码+mysql数据库+系统+lw文档+部署
原文地址：https://blog.csdn.net/zch981964/article/details/132827451

java word文档 转 html文件

用java将word转为html文档

1、简介

2、添加依赖

3、代码示例

java word文档转 html文件