开发手账（一）

一、关于设计

（一）数据库

选择外键所引用的字段时，需先判断该字段的值今后是否可能被修改。以菜单 id、菜单 code、菜单名为例：前两者通常不会变更，可以作为外键引用；菜单名随时可能被修改，不应作为外键。

二、关于组件

（一）POI

1. 文档页数统计

import lombok.extern.slf4j.Slf4j;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.ofdrw.reader.OFDReader;
import org.springframework.web.multipart.MultipartFile;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Path;
import java.nio.file.Paths;
@Slf4j
public class LvDocPageCounter {
    public static final String DOCUMENT_PAGE_TEMP = "DOCUMENT_PAGE_TEMP";
    public static int getPageCount(String filePath) {
        String fileType = getFileType(filePath);
        try {
            switch (fileType) {
                case "pdf":
                    return getPdfPageCount(filePath);
                case "docx":
                    return getDocxPageCount(filePath);
                case "doc":
                    return getDocPageCount(filePath);
                case "ofd":
                    return getOfdPageCount(filePath);
                // Add more cases for other document types as needed
                default:
                    log.warn("不支持的文件类型:{}", filePath);
                    return 1;
//                throw new IllegalArgumentException("Unsupported file type");
            }
        } catch (Exception e) {
            log.warn("读取文件异常:{},{}", filePath,e);
            return 0;
        }
    }

    /**
     * 文件类型
     * @param filePath
     * @return
     */
    private static String getFileType(String filePath) {
        int dotIndex = filePath.lastIndexOf('.');
        if (dotIndex == -1 || dotIndex == filePath.length() - 1) {
            log.warn("文件名中没有找到扩展名:{}", filePath);
            return "";
        }
        return filePath.substring(dotIndex + 1).toLowerCase();
    }
    /**
     * 获取PDF文档页数
     * @param filePath
     * @return
     * @throws IOException
     */
    private static int getPdfPageCount(String filePath) throws IOException {
        try (PDDocument document = Loader.loadPDF(new File(filePath))) {
//            PDDocument document = new PDDocument();
            int numberOfPages = document.getNumberOfPages();
            document.close();
            return numberOfPages;
        }
    }

    /**
     * 获取doc文档页数
     * @param filePath
     * @return
     * @throws IOException
     */
    private static int getDocPageCount(String filePath) throws IOException {
//        try (InputStream inputStream = new FileInputStream(filePath);
//             HWPFDocument document = new HWPFDocument(inputStream)) {
//            int pageCount = document.getSummaryInformation().getPageCount();
//            document.close();
//            return pageCount;
//        }
        try (InputStream inputStream = new FileInputStream(filePath)) {
            com.aspose.words.Document doc = new com.aspose.words.Document(inputStream);
            int num = doc.getPageCount();
            doc.cleanup();
            return num;
        } catch (Exception e) {
            e.printStackTrace();
            return 0;
        }
    }

    /**
     * 获取docx页数
     * @param filePath
     * @return
     * @throws IOException
     */
    private static int getDocxPageCount(String filePath) throws IOException {
//        try (InputStream inputStream = new FileInputStream(filePath);
//             XWPFDocument document = new XWPFDocument(inputStream)) {
//            int pages = document.getProperties().getExtendedProperties().getUnderlyingProperties().getPages();
//            document.close();
//            return pages;
//        }
        try (InputStream inputStream = new FileInputStream(filePath)) {
            com.aspose.words.Document doc = new com.aspose.words.Document(inputStream);
            int num = doc.getPageCount();
            doc.cleanup();
            return num;
        } catch (Exception e) {
            e.printStackTrace();
            return 0;
        }

    }

    /**
     * pdf页数
     * @param filePath
     * @return
     * @throws IOException
     */
    private static int getOfdPageCount(String filePath) throws IOException {
        Path ofdFile = Paths.get(filePath);
        OFDReader ofdReader = new OFDReader(ofdFile);
        int numberOfPages = ofdReader.getNumberOfPages();
        ofdReader.close();
        return numberOfPages;
    }


    /**
     * 获取缓存文件页数
     * @param inputStream
     * @param originalFilename
     * @return
     */
    public static Integer getPageCount(MultipartFile inputStream, String originalFilename) {
        try (InputStream inputStream1 = inputStream.getInputStream()) {
            return getPageCount(inputStream1,originalFilename);
        } catch (IOException e) {
            log.warn("读取文件异常:{},{}", originalFilename,e);
            return 0;
        }
    }

// Add methods for other document types as needed
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147

2. 文本提取

import cn.hutool.core.io.FileUtil;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.io.FilenameUtils;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.ofdrw.converter.export.TextExporter;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.concurrent.atomic.AtomicInteger;

/**
 * @author yilv
 * @version 1.0
 * @description: TODO
 * @date 2023/11/16 16:12
 */
@Slf4j
public class LvDocTxTHunter {
    private static AtomicInteger  UPPER_LIMIT=new AtomicInteger(50);
    /**
     * 读取文档内容
     * @param filePath
     * @return
     */
    public static String readText(String filePath) {

        int pageCount = LvDocPageCounter.getPageCount(filePath);
        if (pageCount >UPPER_LIMIT.get()) {
            log.warn("文件过大:{},{}", filePath,pageCount);
            return "";
        }
        String fileType = getFileType(filePath);
        try {
            switch (fileType) {
                case "pdf":
                    return readPdfText(filePath);
                case "doc":
                    return readDocText(filePath);
                case "docx":
                    return readDocxText(filePath);
                case "ofd":
                    return readOfdText(filePath);
                // Add more cases for other document types as needed
                default:
                    log.warn("不支持的文件类型:{}", filePath);
                    return "";
            }
        } catch (IOException e) {
            log.warn("读取文件异常:{},{}", filePath,e);
            return "";
        }

    }

    /**
     * 获取文件类型
     * @param filePath
     * @return
     */
    private static String getFileType(String filePath) {
        int dotIndex = filePath.lastIndexOf('.');
        if (dotIndex == -1 || dotIndex == filePath.length() - 1) {
            log.warn("文件名中没有找到扩展名:{}", filePath);
            return "";
        }
        return filePath.substring(dotIndex + 1).toLowerCase();
    }

    /**
     * 获取pdf文本
     * @param filePath
     * @return
     * @throws IOException
     */
    private static String readPdfText(String filePath) throws IOException {
        try (PDDocument document = Loader.loadPDF(filePath)) {
            String text = new PDFTextStripper().getText(document);
            document.close();
            return text;
        }
    }

    /**
     * 获取doc文本
     * @param filePath
     * @return
     * @throws IOException
     */
    private static String readDocText(String filePath) throws IOException {
        try (InputStream inputStream = new FileInputStream(filePath);
             HWPFDocument document = new HWPFDocument(inputStream)) {
            WordExtractor extractor = new WordExtractor(document);
            String text = extractor.getText();
            document.close();
            return text;
        }
    }

    /**
     * 获取docx文本
     * @param filePath
     * @return
     * @throws IOException
     */
    private static String readDocxText(String filePath) throws IOException {
        try (InputStream inputStream = new FileInputStream(filePath);
             XWPFDocument document = new XWPFDocument(inputStream)) {
            XWPFWordExtractor extractor = new XWPFWordExtractor(document);
            String text = extractor.getText();
            document.close();
            return text;
        }
    }
    /**
     * pdf页数
     * @param filePath
     * @return
     * @throws IOException
     */
    private static String readOfdText(String filePath) throws IOException {
        Path txtPath = Paths.get("DOCUMENT_PAGE_TEMP", FilenameUtils.getBaseName(filePath) + ".txt");
        TextExporter textExporter = new TextExporter(Paths.get(filePath), txtPath);
        textExporter.export();
        String s = FileUtil.readUtf8String(txtPath.toFile());
        textExporter.close();
        return s;
    }

    /**
     * 获取文件文本
     * @param tempFile
     * @return
     */
    public static String readText(File tempFile) {
        return readText(tempFile.getPath());
    }
    // Add methods for other document types as needed
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147

3. 文档转换

ofd转换

- ①启动加载字体

    /**
     * Loads system data that must be in place before any conversion runs: pre-loads fonts from
     * the local {@code font} directory and prints the names of the fonts that were registered.
     */
    private static void systemInit() {
        FontLoader preload = FontLoader.Preload();
        // IMPORTANT: force the default locale to Simplified Chinese; otherwise fonts
        // (notably the FangZheng family) are loaded inconsistently across environments.
        Locale.setDefault(Locale.SIMPLIFIED_CHINESE);
        preload.scanFontDir(Paths.get(FileUtil.local, "font"));

        // The name -> path mapping is a non-public field of FontLoader, so it is read
        // reflectively. This is diagnostics only: if the field is missing or has another
        // type, report that instead of failing the whole initialisation.
        Field namePathMapping = ReflectUtil.getField(FontLoader.class, "fontNamePathMapping");
        Object mapping = namePathMapping == null
                ? null
                : ReflectUtil.getFieldValue(preload, namePathMapping);
        if (!(mapping instanceof Map)) {
            System.out.println("加载字体:无法读取字体映射 fontNamePathMapping");
            return;
        }
        System.out.println("加载字体:" + JSONUtil.toJsonStr(((Map<?, ?>) mapping).keySet()));
    }
1
2
3
4
5
6
7
8
9
10
11
12

- ②使用 ofdrw 将 OFD 转换为 PDF

    /**
     * Converts an OFD document to PDF.
     *
     * @param ofdPath  path of the source OFD file
     * @param distPath output directory; currently not used by the conversion and kept only so
     *                 existing callers keep compiling
     * @param pdfPath  path of the PDF file to write
     * @throws IOException if the OFD file cannot be read or the PDF cannot be written
     */
    public static void convertOfdToPDFByBridge(String ofdPath, String distPath, String pdfPath) throws IOException {
        log.debug("解析文件:{}",ofdPath);
        // try-with-resources closes the exporter even when export() fails.
        try (PDFExporterIText exporter = new PDFExporterIText(Paths.get(ofdPath), Paths.get(pdfPath))) {
            exporter.export();
        }
    }
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17

相关阅读:
SolidWorks2021导出带材质的OBJ文件
 eclipse启动tomcat是出现Server Tomcat v9.0 Server at localhost failed to start.错误
 【C++刷题】二叉树进阶刷题
 看完这篇教你玩转渗透测试靶机vulnhub——VICTIM: 1
机器学习：Softmax介绍及代码实现
 vue学习之列表渲染
 第21章自旋锁实验（iTOP-RK3568开发板驱动开发指南）
「Verilog学习笔记」使用generate…for语句简化代码
 FRC-EP系列--你的汽车数据一站式管家
 Git --》如何在IDEA中玩转Git与GitHub？
原文地址：https://blog.csdn.net/weixin_53083143/article/details/134505975

开发手账（一）

一、 关于设计

（一）数据库

二、关于组件

（一）POI

1. 文档页数统计

2. 文本提取

3. 文档转换

一、关于设计