读取 DOC、DOCX、XLS、XLSX、PDF、PPT、PPTX、TXT 文档内容
POM 依赖
<!-- https://mvnrepository.com/artifact/org.apache.poi/poi -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi</artifactId>
<version>4.1.2</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.poi/poi-ooxml -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>4.1.2</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.poi/poi-ooxml-schemas -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml-schemas</artifactId>
<version>4.1.2</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.poi/poi-scratchpad -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-scratchpad</artifactId>
<version>4.1.2</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.pdfbox/fontbox -->
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>fontbox</artifactId>
<version>2.0.12</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.pdfbox/pdfbox -->
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>2.0.12</version>
</dependency>
代码
package org.jeecg.common.util;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Iterator;
import java.util.List;
import org.apache.pdfbox.io.RandomAccessBuffer;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.poi.hslf.extractor.PowerPointExtractor;
import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.sl.extractor.SlideShowExtractor;
import org.apache.poi.sl.usermodel.SlideShow;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.CellType;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.xslf.usermodel.XMLSlideShow;
import org.apache.poi.xslf.usermodel.XSLFShape;
import org.apache.poi.xslf.usermodel.XSLFTextParagraph;
import org.apache.poi.xssf.usermodel.XSSFCell;
import org.apache.poi.xssf.usermodel.XSSFRow;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFPictureData;
import org.jeecg.common.exception.JeecgBootException;
import lombok.extern.slf4j.Slf4j;
/**
 * Extracts plain text from documents stored on the local file system.
 *
 * <p>{@link #getContent(String)} picks a reader from the file extension
 * (doc, docx, xls, xlsx, pdf, ppt, pptx, txt; matched case-insensitively).
 * The individual {@code readXxx} methods can also be called directly when the
 * format is already known.
 *
 * <p>Every reader closes the streams and document handles it opens, logs any
 * failure and rethrows it as a {@link JeecgBootException} carrying the
 * original exception's message.
 */
@Slf4j
public class FileReadUtil {

    /**
     * Reads the text content of the file at {@code path}, choosing the reader
     * from the file extension.
     *
     * @param path path of the file to read
     * @return the extracted text, or {@code null} when the file does not exist,
     *         has no extension, or has an unsupported extension
     * @throws JeecgBootException if the selected reader fails
     */
    public static String getContent(String path) {
        File file = new File(path);
        if (!file.exists()) {
            log.error("{}文件不存在", path);
            return null;
        }
        String name = file.getName();
        int dot = name.lastIndexOf('.');
        if (dot < 0) {
            // No extension: nothing to dispatch on.
            return null;
        }
        String type = name.substring(dot + 1);
        log.info("文件名称:{}", name);
        // Locale.ROOT keeps the lower-casing independent of the JVM's default locale.
        switch (type.toLowerCase(java.util.Locale.ROOT)) {
            case "doc":
                return readWord(path);
            case "docx":
                return readDoc(path);
            case "xls":
                return readXls(path);
            case "xlsx":
                return readXlsx(path);
            case "pdf":
                return readPdf(path);
            case "ppt":
                return readPPT(path);
            case "pptx":
                return readPPTX(path);
            case "txt":
                return readTxt(path);
            default:
                log.info("不支持读取{}类型的文件类型", type);
                return null;
        }
    }

    /**
     * Reads a text file and joins its lines with a tab character.
     *
     * @param path path of the text file
     * @return all lines of the file joined by {@code "\t"}
     * @throws JeecgBootException if the file cannot be read
     */
    public static String readTxt(String path) {
        try {
            return String.join("\t", Files.readAllLines(Paths.get(path)));
        } catch (IOException e) {
            log.error(e.getMessage(), e);
            throw new JeecgBootException(e.getMessage());
        }
    }

    /**
     * Reads the text of a binary Word document ({@code .doc}).
     *
     * @param path path of the {@code .doc} file
     * @return the text returned by {@link WordExtractor#getText()}
     * @throws JeecgBootException if the file cannot be opened or parsed
     */
    public static String readWord(String path) {
        try (FileInputStream in = new FileInputStream(new File(path));
                WordExtractor extractor = new WordExtractor(in)) {
            return extractor.getText();
        } catch (Exception e) {
            log.error(e.getMessage(), e);
            throw new JeecgBootException(e.getMessage());
        }
    }

    /**
     * Reads the text of an OOXML Word document ({@code .docx}).
     *
     * @param path path of the {@code .docx} file
     * @return the text returned by {@link XWPFWordExtractor#getText()}
     * @throws JeecgBootException if the file cannot be opened or parsed
     */
    public static String readDoc(String path) {
        try (FileInputStream in = new FileInputStream(new File(path));
                XWPFDocument document = new XWPFDocument(in);
                XWPFWordExtractor extractor = new XWPFWordExtractor(document)) {
            return extractor.getText();
        } catch (Exception e) {
            log.error(e.getMessage(), e);
            throw new JeecgBootException(e.getMessage());
        }
    }

    /**
     * Reads the cells of the first sheet of a binary Excel workbook
     * ({@code .xls}).
     *
     * @param path path of the {@code .xls} file
     * @return the text of every cell of the first sheet, each followed by a tab
     * @throws JeecgBootException if the file cannot be opened or parsed
     */
    public static String readXls(String path) {
        try (FileInputStream in = new FileInputStream(new File(path));
                HSSFWorkbook excel = new HSSFWorkbook(in)) {
            // Only the first sheet is read.
            return sheetText(excel.getSheetAt(0));
        } catch (Exception e) {
            log.error(e.getMessage(), e);
            throw new JeecgBootException(e.getMessage());
        }
    }

    /**
     * Reads the cells of the first sheet of an OOXML Excel workbook
     * ({@code .xlsx}).
     *
     * @param path path of the {@code .xlsx} file
     * @return the text of every cell of the first sheet, each followed by a tab
     * @throws JeecgBootException if the file cannot be opened or parsed
     */
    public static String readXlsx(String path) {
        try (FileInputStream in = new FileInputStream(new File(path));
                XSSFWorkbook excel = new XSSFWorkbook(in)) {
            // Only the first sheet is read.
            return sheetText(excel.getSheetAt(0));
        } catch (Exception e) {
            log.error(e.getMessage(), e);
            throw new JeecgBootException(e.getMessage());
        }
    }

    /**
     * Reads the text of all pages of a PDF document.
     *
     * @param path path of the {@code .pdf} file
     * @return the text produced by {@link PDFTextStripper}, sorted by position
     * @throws JeecgBootException if the file cannot be opened or parsed
     */
    public static String readPdf(String path) {
        try (PDDocument document = PDDocument.load(new File(path))) {
            PDFTextStripper stripper = new PDFTextStripper();
            // Emit text in visual (position) order rather than content-stream order.
            stripper.setSortByPosition(true);
            stripper.setStartPage(1);
            stripper.setEndPage(document.getNumberOfPages());
            return stripper.getText(document);
        } catch (Exception e) {
            log.error(e.getMessage(), e);
            throw new JeecgBootException(e.getMessage());
        }
    }

    /**
     * Reads the text of a binary PowerPoint presentation ({@code .ppt}).
     *
     * @param path path of the {@code .ppt} file
     * @return the text returned by {@link PowerPointExtractor#getText()}
     * @throws JeecgBootException if the file cannot be opened or parsed
     */
    public static String readPPT(String path) {
        try (FileInputStream in = new FileInputStream(new File(path));
                PowerPointExtractor extractor = new PowerPointExtractor(in)) {
            return extractor.getText();
        } catch (Exception e) {
            log.error(e.getMessage(), e);
            throw new JeecgBootException(e.getMessage());
        }
    }

    /**
     * Reads the text of an OOXML PowerPoint presentation ({@code .pptx}).
     *
     * <p>The file is read through an input stream, so the presentation on disk
     * is never opened for writing.
     *
     * @param path path of the {@code .pptx} file
     * @return the text returned by {@link SlideShowExtractor#getText()}
     * @throws JeecgBootException if the file cannot be opened or parsed
     */
    public static String readPPTX(String path) {
        try (FileInputStream in = new FileInputStream(new File(path));
                XMLSlideShow slideShow = new XMLSlideShow(in);
                SlideShowExtractor<XSLFShape, XSLFTextParagraph> extractor =
                        new SlideShowExtractor<>(slideShow)) {
            return extractor.getText();
        } catch (Exception e) {
            log.error(e.getMessage(), e);
            throw new JeecgBootException(e.getMessage());
        }
    }

    /** Concatenates the text of every cell in {@code rows}, appending a tab after each cell. */
    private static String sheetText(Iterable<Row> rows) {
        StringBuilder content = new StringBuilder();
        for (Row row : rows) {
            for (Cell cell : row) {
                content.append(cellText(cell)).append('\t');
            }
        }
        return content.toString();
    }

    /**
     * Returns the text for one cell: the string value for string cells, the
     * numeric value for numeric cells, and an empty string for anything else.
     * Formula cells are rendered from their cached result.
     */
    private static String cellText(Cell cell) {
        CellType type = cell.getCellType();
        if (type == CellType.FORMULA) {
            // A formula may evaluate to text; asking such a cell for a numeric
            // value throws, so decide from the cached result type instead.
            type = cell.getCachedFormulaResultType();
        }
        if (type == CellType.STRING) {
            return cell.getStringCellValue();
        }
        if (type == CellType.NUMERIC) {
            return String.valueOf(cell.getNumericCellValue());
        }
        return "";
    }
}
|