Word 文档转换 PDF、图片

工作中经常遇到需要把 Word 文档转换为 PDF 或图片的场景，我们来看看在 Java 开发中怎么解决这个问题。

Word 转 PDF

Word 转 PDF 分为商用 Aspose 方案和开源 Apache POI+iText 方案。

Aspose 方案

这种方式在目前来看应该是最好的，无论是转换的速度、成功的概率，还是支持的文件类型。

由于 Aspose 并非开源软件，不会发布到 Maven 公共仓库，故我们要手动把 jar 包加入到 Maven 管理中去。


<dependency>
    <groupId>com.aspose</groupId>
    <artifactId>aspose-words</artifactId>
    <version>15.8</version>
    <scope>system</scope>
    <systemPath>${project.basedir}/jar/aspose-words-15.8.0-jdk16.jar</systemPath>
</dependency>
1
2
3
4
5
6
7
8

添加依赖

因为是手动添加的包，MANIFEST.MF 的 Class-Path 里也要加入它，不然启动程序的时候不知道要加载这个 jar 包。增加一个 manifestEntries 节点：

<manifestEntries>

    <Class-Path>lib/aspose-words-15.8.jar</Class-Path>
</manifestEntries>
1
2
3
4

新增于 pom.xml 的位置如图：
在这里插入图片描述
拷贝 jar 包时，除了 runtime 范围的依赖，还要拷贝刚新加的 system 范围的包，为此新增一个 id 为 copy-dependencies2 的 execution：

<execution>
     <id>copy-dependencies2</id>
     <phase>package</phase>
     <goals>
         <goal>copy-dependencies</goal>
     </goals>
     <configuration>
         <outputDirectory>${project.build.directory}/lib</outputDirectory>
         <includeScope>system</includeScope>
     </configuration>
 </execution>
1
2
3
4
5
6
7
8
9
10
11

新增于 pom.xml 的位置如图：
在这里插入图片描述

转换程序

import com.aspose.words.Document;
import com.aspose.words.ImageSaveOptions;
import com.aspose.words.License;
import com.aspose.words.SaveFormat;

import java.io.ByteArrayInputStream;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.nio.file.Paths;

/**
 * Word conversion helpers built on Aspose.Words: Word to PDF and Word to one image per page.
 * <p>
 * Each conversion first applies the embedded license through {@link #getLicense()}.
 * Conversion failures are reported on stderr only; the conversion methods do not throw.
 */
public class AsposeUtil {
    /**
     * Converts a Word document to PDF.
     * <p>
     * The document is loaded before the output file is opened, so an unreadable
     * input does not leave an empty PDF behind.
     *
     * @param wordPath path of the Word document to read
     * @param pdfPath  path of the PDF file to write
     */
    public static void word2pdf(String wordPath, String pdfPath) {
        AsposeUtil.getLicense();

        try {
            long old = System.currentTimeMillis();

            // On Linux the generated PDF may contain garbled characters unless a font folder
            // is configured; on Windows this was reported not to be necessary. Enable and
            // adjust the path below when needed:
            //FontSettings.setFontsFolder("/usr/share/fonts/chinese", false);
            Document doc = new Document(wordPath);

            try (FileOutputStream os = new FileOutputStream(pdfPath)) {
                doc.save(os, SaveFormat.PDF);
            }

            System.out.println("word2pdf共耗时：" + (System.currentTimeMillis() - old) / 1000.0 + "秒");
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Renders every page of a Word document to its own PNG file.
     * <p>
     * Files are named {@code image_1.png}, {@code image_2.png}, ... inside {@code outputDir}.
     * The image format matches the {@code .png} extension, and the directory and file name
     * are joined with a proper separator, so {@code outputDir} works with or without a
     * trailing slash.
     *
     * @param wordPath  path of the Word document to read
     * @param outputDir directory that receives the page images
     */
    public static void word2img(String wordPath, String outputDir) {
        AsposeUtil.getLicense();

        try {
            long old = System.currentTimeMillis();
            Document doc = new Document(wordPath);
            int totalPages = doc.getPageCount();

            ImageSaveOptions options = new ImageSaveOptions(SaveFormat.PNG);
            options.setPageCount(1); // each save call renders exactly one page
//            options.setResolution(300); // image resolution; the original note says the default is 96dpi

            // Render page by page; pageIndex is 0-based, file names are 1-based
            for (int pageIndex = 0; pageIndex < totalPages; pageIndex++) {
                String outputFileName = Paths.get(outputDir, "image_" + (pageIndex + 1) + ".png").toString();
                options.setPageIndex(pageIndex);
                doc.save(outputFileName, options);
            }

            System.out.println("word2img共耗时：" + (System.currentTimeMillis() - old) / 1000.0 + "秒");
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    // License payload handed to Aspose by getLicense().
    // NOTE(review): as it stands this text carries no XML markup; confirm it matches
    // the license file issued by Aspose before relying on it.
    private static final byte[] LICENSE = ("\n" +
            "    \n" +
            "        \n" +
            "            Aspose.Total for Java\n" +
            "            Aspose.Words for Java\n" +
            "        \n" +
            "        Enterprise\n" +
            "        20991231\n" +
            "        20991231\n" +
            "        8bfe198c-7f0c-4ef8-8ff0-acc3237bf0d7\n" +
            "    \n" +
            "    sNLLKGMUdF0r8O1kKilWAGdgfs2BvJb/2Xp8p5iuDVfZXmhppo+d0Ran1P9TKdjV4ABwAgKXxJ3jcQTqE/2IRfqwnPf8itN8aFZlV3TJPYeD3yWE7IT55Gz6EijUpC7aKeoohTb4w2fpox58wWoF3SNp6sK6jDfiAUGEHYJ9pjU=\n" +
            "").getBytes(StandardCharsets.UTF_8);

    /**
     * Applies the embedded license to Aspose.Words.
     * <p>
     * Per the original author's note, without a license the library runs as a trial
     * version and converted files carry a watermark.
     *
     * @throws RuntimeException wrapping any failure raised while setting the license
     */
    public static void getLicense() {
        try (InputStream is = new ByteArrayInputStream(LICENSE)) {
            new License().setLicense(is);
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    }

}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87

Apache POI + iText 方案


<dependency>
    <groupId>fr.opensagres.xdocreport</groupId>
    <artifactId>org.apache.poi.xwpf.converter.pdf</artifactId>
    <version>1.0.6</version>
</dependency>
1
2
3
4
5
6

转换程序

import fr.opensagres.xdocreport.utils.StringUtils;
import org.apache.poi.xwpf.converter.pdf.PdfConverter;
import org.apache.poi.xwpf.converter.pdf.PdfOptions;
import org.apache.poi.xwpf.usermodel.*;

import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.List;
import java.util.Map;

/**
 * Converts .docx documents to PDF with Apache POI (XWPF) and the xdocreport PDF converter,
 * substituting placeholder text on the way.
 * <p>
 * A placeholder is replaced only when the first text piece of a run equals a key of the
 * parameter map exactly; placeholders split across several runs are not matched.
 *
 * @author Rocca
 */
public class WordPdfUtils {

    /**
     * Converts a Word document to PDF with default converter options, replacing placeholders.
     *
     * @param source the Word document; must be in docx format
     * @param target stream that receives the PDF
     * @param params placeholder text mapped to its replacement; {@code null} or empty skips substitution
     */
    public static void wordConverterToPdf(InputStream source, OutputStream target, Map<String, String> params) {
        wordConverterToPdf(source, target, null, params);
    }

    /**
     * Converts a Word document to PDF, replacing placeholders in body paragraphs and in
     * the paragraphs of top-level table cells.
     *
     * @param source  the Word document; must be in docx format
     * @param target  stream that receives the PDF
     * @param options converter options, e.g. {@code PdfOptions.create().fontEncoding("windows-1250")};
     *                passed to the converter unchanged (may be {@code null})
     * @param params  placeholder text mapped to its replacement; {@code null} or empty skips substitution
     * @throws RuntimeException wrapping any {@link IOException} raised while reading or converting
     */
    public static void wordConverterToPdf(InputStream source, OutputStream target, PdfOptions options, Map<String, String> params) {
        long old = System.currentTimeMillis();

        try {
            XWPFDocument doc = new XWPFDocument(source);
            paragraphReplace(doc.getParagraphs(), params);

            for (XWPFTable table : doc.getTables()) {
                for (XWPFTableRow row : table.getRows()) {
                    for (XWPFTableCell cell : row.getTableCells()) {
                        paragraphReplace(cell.getParagraphs(), params);
                    }
                }
            }

            PdfConverter.getInstance().convert(doc, target, options);
            System.out.println("word2pdf共耗时：" + (System.currentTimeMillis() - old) / 1000.0 + "秒");
        } catch (IOException e) {
            throw new RuntimeException("Failed to convert Word document to PDF", e);
        }
    }

    /**
     * Replaces the text of every run whose first text piece equals a key in {@code params}.
     */
    private static void paragraphReplace(List<XWPFParagraph> paragraphs, Map<String, String> params) {
        if (params == null || params.isEmpty()) {
            return; // nothing to substitute
        }

        for (XWPFParagraph p : paragraphs) {
            for (XWPFRun r : p.getRuns()) {
                // Read the same text piece (index 0) that setText(..., 0) overwrites.
                // getTextPosition() is the run's vertical raise/lower offset, not an index.
                String content = r.getText(0);
                if (StringUtils.isNotEmpty(content) && params.containsKey(content)) {
                    r.setText(params.get(content), 0);
                }
            }
        }
    }

}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69

PDF 转图片

上述的 Aspose.Words 并不支持 PDF 转图片。要用 Aspose 把 PDF 转成图片，须使用他家的另外一个产品 Aspose.Pdf。另外有趣的是，Aspose.Words 可以把 Word 直接转为图片，但当前的需求是先得到 PDF、加盖章和签名之后再转换为图片，所以不能从 Word 直接转图片。而且感觉 Word 转图片也比较慢。

我感觉 PDF 转图片比较简单，不用 Aspose 也行，于是使用了 Apache 的 PDFBox。


<dependency>
    <groupId>org.apache.pdfbox</groupId>
    <artifactId>pdfbox</artifactId>
    <version>3.0.0</version>
</dependency>
1
2
3
4
5
6

你可以调整 DPI 分辨率和图片格式，下面的例子输出的是 gif 格式。

/**
 * Renders every page of a PDF to its own GIF file.
 * <p>
 * Files are named {@code img-0.gif}, {@code img-1.gif}, ... (0-based page index) inside
 * {@code outputDir}. Each page is encoded straight to its file. Failures are printed to
 * stderr and stop the conversion at the failing page; the method does not throw.
 *
 * @param pdfFile   path of the PDF file to read
 * @param outputDir directory that receives the page images
 */
public static void pdf2Img(String pdfFile, String outputDir) {
    long old = System.currentTimeMillis();

    try (PDDocument document = Loader.loadPDF(new File(pdfFile))) {
        PDFRenderer renderer = new PDFRenderer(document);
        int pageCount = document.getNumberOfPages();

        for (int i = 0; i < pageCount; ++i) {
            // DPI and FileHelper.SEPARATOR are defined outside this snippet
            File imageFile = new File(outputDir + FileHelper.SEPARATOR + "img-" + i + ".gif");

            // ImageIO.write signals "no suitable writer" by returning false rather than throwing
            if (!ImageIO.write(renderer.renderImageWithDPI(i, DPI), "gif", imageFile)) {
                throw new IOException("No ImageIO writer available to encode page " + i + " as gif: " + imageFile);
            }
        }

        System.out.println("pdf2img共耗时：" + (System.currentTimeMillis() - old) / 1000.0 + "秒");
    } catch (IOException e) {
        e.printStackTrace();
    }
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26

一般一份 PDF 是多页的，于是也会输出多张图片。所以你可以修改里面的文件名生成规则。

相关阅读:
安装speccpu2006时报错
 redis配置文件详情
 Base64编码与解码
 vs2019_qt6.2.4_dcmtk3.6.7_vtk9.2.2_itk5.3_opencv4.6.0编译记录
 Django TypeError: Abstract models cannot be instantiated.错误解决方案
 美国生活经济学 -- 读书笔记
 统计学考研笔记：季度指数
 【FDTD 反射、透射、吸收软件操作】
改进的PSO-BP算法在工业机器人末端位姿误差补偿中的应用
 网易数帆黄久远：大规模Kubernetes监控体系建设之路
原文地址：https://blog.csdn.net/zhangxin09/article/details/132998249