一、关于设计
(一)数据库
- 确定外键标识时,需先判断该字段是否有可能被修改。例如菜单 id、菜单 code、菜单名三者中,前两者通常不会变更,都可作为外键;菜单名可能被修改,不应作为外键。
二、关于组件
(一)POI
1. 文档页数统计
import lombok.extern.slf4j.Slf4j;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.ofdrw.reader.OFDReader;
import org.springframework.web.multipart.MultipartFile;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Path;
import java.nio.file.Paths;
/**
 * Counts the pages of PDF, DOC, DOCX and OFD documents.
 *
 * <p>Every public entry point is best-effort: a read failure is logged and reported as
 * {@code 0}; a file whose extension is missing or not recognised is reported as {@code 1}.
 *
 * <p>NOTE(review): {@code Loader} is not among the imports shown with this snippet —
 * presumably {@code org.apache.pdfbox.Loader} (PDFBox 3.x); confirm and add the import.
 */
@Slf4j
public class LvDocPageCounter {

    /** Shared name for temporary artifacts; used in this class as the temp-file prefix for uploads. */
    public static final String DOCUMENT_PAGE_TEMP = "DOCUMENT_PAGE_TEMP";

    /**
     * Returns the page count of the document stored at {@code filePath}.
     *
     * @param filePath path of the document; the type is derived from the file extension
     * @return the page count, {@code 1} for an unsupported or missing extension,
     *         or {@code 0} when the path is {@code null} or the file cannot be read
     */
    public static int getPageCount(String filePath) {
        if (filePath == null) {
            log.warn("文件路径为空");
            return 0;
        }
        String fileType = getFileType(filePath);
        try {
            switch (fileType) {
                case "pdf":
                    return getPdfPageCount(filePath);
                case "doc":
                case "docx":
                    // Both Word formats go through the same reader.
                    return getWordPageCount(filePath);
                case "ofd":
                    return getOfdPageCount(filePath);
                default:
                    log.warn("不支持的文件类型:{}", filePath);
                    return 1;
            }
        } catch (Exception e) {
            // The trailing Throwable is consumed by SLF4J as the exception, so it must not
            // have a placeholder of its own.
            log.warn("读取文件异常:{}", filePath, e);
            return 0;
        }
    }

    /**
     * Extracts the lower-cased extension (without the dot) from a file name or path.
     *
     * @return the extension, or an empty string when the name is {@code null}, has no dot,
     *         or ends with a dot
     */
    private static String getFileType(String filePath) {
        if (filePath == null) {
            log.warn("文件名中没有找到扩展名:{}", filePath);
            return "";
        }
        int dotIndex = filePath.lastIndexOf('.');
        if (dotIndex == -1 || dotIndex == filePath.length() - 1) {
            log.warn("文件名中没有找到扩展名:{}", filePath);
            return "";
        }
        return filePath.substring(dotIndex + 1).toLowerCase();
    }

    /** Counts the pages of a PDF file; the document is closed by try-with-resources. */
    private static int getPdfPageCount(String filePath) throws IOException {
        try (PDDocument document = Loader.loadPDF(new File(filePath))) {
            return document.getNumberOfPages();
        }
    }

    /**
     * Counts the pages of a DOC or DOCX file.
     *
     * @throws Exception any failure while opening or laying out the document; the public
     *                   entry point logs it and reports {@code 0}
     */
    private static int getWordPageCount(String filePath) throws Exception {
        try (InputStream inputStream = new FileInputStream(filePath)) {
            com.aspose.words.Document doc = new com.aspose.words.Document(inputStream);
            int pageCount = doc.getPageCount();
            // Kept from the original flow. NOTE(review): confirm cleanup() is needed here.
            doc.cleanup();
            return pageCount;
        }
    }

    /** Counts the pages of an OFD file; the reader is closed even when counting fails. */
    private static int getOfdPageCount(String filePath) throws IOException {
        Path ofdFile = Paths.get(filePath);
        try (OFDReader ofdReader = new OFDReader(ofdFile)) {
            return ofdReader.getNumberOfPages();
        }
    }

    /**
     * Returns the page count of an uploaded document.
     *
     * <p>The upload is copied to a temporary file that carries the extension of
     * {@code originalFilename}, counted through {@link #getPageCount(String)}, and the
     * temporary file is removed afterwards. The upload's stream is only read, never moved.
     *
     * @param inputStream      the uploaded file
     * @param originalFilename the client-side file name, used only to determine the type
     * @return the page count, {@code 1} for an unsupported or missing extension,
     *         or {@code 0} when the upload is {@code null} or cannot be read
     */
    public static Integer getPageCount(MultipartFile inputStream, String originalFilename) {
        if (inputStream == null) {
            log.warn("读取文件异常:{}", originalFilename);
            return 0;
        }
        File tempFile = null;
        try (InputStream uploadStream = inputStream.getInputStream()) {
            tempFile = File.createTempFile(DOCUMENT_PAGE_TEMP, "." + getFileType(originalFilename));
            java.nio.file.Files.copy(uploadStream, tempFile.toPath(),
                    java.nio.file.StandardCopyOption.REPLACE_EXISTING);
            return getPageCount(tempFile.getPath());
        } catch (IOException e) {
            log.warn("读取文件异常:{}", originalFilename, e);
            return 0;
        } finally {
            if (tempFile != null && !tempFile.delete()) {
                log.warn("临时文件删除失败:{}", tempFile);
            }
        }
    }
}

- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
- 16
- 17
- 18
- 19
- 20
- 21
- 22
- 23
- 24
- 25
- 26
- 27
- 28
- 29
- 30
- 31
- 32
- 33
- 34
- 35
- 36
- 37
- 38
- 39
- 40
- 41
- 42
- 43
- 44
- 45
- 46
- 47
- 48
- 49
- 50
- 51
- 52
- 53
- 54
- 55
- 56
- 57
- 58
- 59
- 60
- 61
- 62
- 63
- 64
- 65
- 66
- 67
- 68
- 69
- 70
- 71
- 72
- 73
- 74
- 75
- 76
- 77
- 78
- 79
- 80
- 81
- 82
- 83
- 84
- 85
- 86
- 87
- 88
- 89
- 90
- 91
- 92
- 93
- 94
- 95
- 96
- 97
- 98
- 99
- 100
- 101
- 102
- 103
- 104
- 105
- 106
- 107
- 108
- 109
- 110
- 111
- 112
- 113
- 114
- 115
- 116
- 117
- 118
- 119
- 120
- 121
- 122
- 123
- 124
- 125
- 126
- 127
- 128
- 129
- 130
- 131
- 132
- 133
- 134
- 135
- 136
- 137
- 138
- 139
- 140
- 141
- 142
- 143
- 144
- 145
- 146
- 147
2. 文本提取
import cn.hutool.core.io.FileUtil;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.io.FilenameUtils;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.ofdrw.converter.export.TextExporter;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.concurrent.atomic.AtomicInteger;
/**
 * Extracts the plain text of PDF, DOC, DOCX and OFD documents.
 *
 * <p>Extraction is best-effort: unsupported types, documents above the page limit and read
 * failures are logged and reported as an empty string.
 *
 * <p>NOTE(review): {@code Loader} is not among the imports shown with this snippet —
 * presumably {@code org.apache.pdfbox.Loader} (PDFBox 3.x); confirm and add the import.
 */
@Slf4j
public class LvDocTxTHunter {

    /** Documents with more pages than this are skipped. */
    private static final AtomicInteger UPPER_LIMIT = new AtomicInteger(50);

    /**
     * Reads the text content of the document stored at {@code filePath}.
     *
     * @param filePath path of the document; the type is derived from the file extension
     * @return the extracted text, or an empty string when the path is {@code null}, the
     *         document exceeds the page limit, the type is unsupported, or reading fails
     */
    public static String readText(String filePath) {
        if (filePath == null) {
            log.warn("文件路径为空");
            return "";
        }
        int pageCount = LvDocPageCounter.getPageCount(filePath);
        if (pageCount > UPPER_LIMIT.get()) {
            log.warn("文件过大:{},{}", filePath, pageCount);
            return "";
        }
        String fileType = getFileType(filePath);
        try {
            switch (fileType) {
                case "pdf":
                    return readPdfText(filePath);
                case "doc":
                    return readDocText(filePath);
                case "docx":
                    return readDocxText(filePath);
                case "ofd":
                    return readOfdText(filePath);
                default:
                    log.warn("不支持的文件类型:{}", filePath);
                    return "";
            }
        } catch (IOException | RuntimeException e) {
            // Runtime failures from the parsers are handled like I/O errors so that this
            // method stays best-effort. The trailing Throwable is consumed by SLF4J as the
            // exception, so it must not have a placeholder of its own.
            log.warn("读取文件异常:{}", filePath, e);
            return "";
        }
    }

    /**
     * Extracts the lower-cased extension (without the dot) from a file name or path.
     *
     * @return the extension, or an empty string when the name has no dot or ends with a dot
     */
    private static String getFileType(String filePath) {
        int dotIndex = filePath.lastIndexOf('.');
        if (dotIndex == -1 || dotIndex == filePath.length() - 1) {
            log.warn("文件名中没有找到扩展名:{}", filePath);
            return "";
        }
        return filePath.substring(dotIndex + 1).toLowerCase();
    }

    /** Reads the text of a PDF file; the document is closed by try-with-resources. */
    private static String readPdfText(String filePath) throws IOException {
        // Loaded from a File, matching how LvDocPageCounter opens PDFs.
        try (PDDocument document = Loader.loadPDF(new File(filePath))) {
            return new PDFTextStripper().getText(document);
        }
    }

    /** Reads the text of a legacy DOC file. */
    private static String readDocText(String filePath) throws IOException {
        try (InputStream inputStream = new FileInputStream(filePath);
             HWPFDocument document = new HWPFDocument(inputStream)) {
            WordExtractor extractor = new WordExtractor(document);
            return extractor.getText();
        }
    }

    /** Reads the text of a DOCX file. */
    private static String readDocxText(String filePath) throws IOException {
        try (InputStream inputStream = new FileInputStream(filePath);
             XWPFDocument document = new XWPFDocument(inputStream)) {
            XWPFWordExtractor extractor = new XWPFWordExtractor(document);
            return extractor.getText();
        }
    }

    /**
     * Reads the text of an OFD file by exporting it to a temporary {@code .txt} file.
     *
     * <p>The exporter is closed before the exported file is read, so that any output it
     * still holds is written out first. The temporary file is deleted afterwards.
     */
    private static String readOfdText(String filePath) throws IOException {
        Path txtPath = Paths.get(LvDocPageCounter.DOCUMENT_PAGE_TEMP,
                FilenameUtils.getBaseName(filePath) + ".txt");
        try {
            try (TextExporter textExporter = new TextExporter(Paths.get(filePath), txtPath)) {
                textExporter.export();
            }
            return FileUtil.readUtf8String(txtPath.toFile());
        } finally {
            File txtFile = txtPath.toFile();
            if (txtFile.exists() && !txtFile.delete()) {
                log.warn("临时文件删除失败:{}", txtFile);
            }
        }
    }

    /**
     * Reads the text content of {@code tempFile}.
     *
     * @param tempFile the document to read; {@code null} yields an empty string
     * @return the extracted text, see {@link #readText(String)}
     */
    public static String readText(File tempFile) {
        return readText(tempFile == null ? null : tempFile.getPath());
    }
}

- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
- 16
- 17
- 18
- 19
- 20
- 21
- 22
- 23
- 24
- 25
- 26
- 27
- 28
- 29
- 30
- 31
- 32
- 33
- 34
- 35
- 36
- 37
- 38
- 39
- 40
- 41
- 42
- 43
- 44
- 45
- 46
- 47
- 48
- 49
- 50
- 51
- 52
- 53
- 54
- 55
- 56
- 57
- 58
- 59
- 60
- 61
- 62
- 63
- 64
- 65
- 66
- 67
- 68
- 69
- 70
- 71
- 72
- 73
- 74
- 75
- 76
- 77
- 78
- 79
- 80
- 81
- 82
- 83
- 84
- 85
- 86
- 87
- 88
- 89
- 90
- 91
- 92
- 93
- 94
- 95
- 96
- 97
- 98
- 99
- 100
- 101
- 102
- 103
- 104
- 105
- 106
- 107
- 108
- 109
- 110
- 111
- 112
- 113
- 114
- 115
- 116
- 117
- 118
- 119
- 120
- 121
- 122
- 123
- 124
- 125
- 126
- 127
- 128
- 129
- 130
- 131
- 132
- 133
- 134
- 135
- 136
- 137
- 138
- 139
- 140
- 141
- 142
- 143
- 144
- 145
- 146
- 147
3. 文档转换
/**
 * Initialises the font environment: preloads the {@code FontLoader}, sets the JVM default
 * locale to Simplified Chinese, scans the {@code font} directory under {@code FileUtil.local},
 * and prints the names of the loaded fonts to standard output.
 */
private static void systemInit() {
    final FontLoader fontLoader = FontLoader.Preload();
    Locale.setDefault(Locale.SIMPLIFIED_CHINESE);
    fontLoader.scanFontDir(Paths.get(FileUtil.local, "font"));

    // The name-to-path mapping is read reflectively from the loader's "fontNamePathMapping" field.
    @SuppressWarnings("unchecked")
    Map<String, String> loadedFonts = (Map<String, String>) ReflectUtil.getFieldValue(
            fontLoader, ReflectUtil.getField(FontLoader.class, "fontNamePathMapping"));

    String fontNames = JSONUtil.toJsonStr(loadedFonts.keySet());
    System.out.println("加载字体:" + fontNames);
}
/**
 * Converts the OFD file at {@code ofdPath} into a PDF written to {@code pdfPath}.
 *
 * @param ofdPath  path of the source OFD file
 * @param distPath not used by the conversion; retained so existing callers keep compiling
 * @param pdfPath  path of the PDF file to produce
 * @throws IOException if the exporter cannot be opened or closed
 */
public static void convertOfdToPDFByBridge(String ofdPath, String distPath, String pdfPath) throws IOException {
    log.debug("解析文件:{}",ofdPath);
    // try-with-resources releases the exporter even when export() fails.
    try (PDFExporterIText exporter = new PDFExporterIText(Paths.get(ofdPath), Paths.get(pdfPath))) {
        exporter.export();
    }
}
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
- 16
- 17