替换100个PDF模板中的部分标签后,将这100个PDF模板文档和10张约400KB的图片合并为一个PDF文档,
耗时约20秒。
1.导入pdfbox
<dependencies>
    <!-- PDFBox: PDF parsing, text replacement and merging -->
    <dependency>
        <groupId>org.apache.pdfbox</groupId>
        <artifactId>pdfbox</artifactId>
        <version>2.0.1</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/log4j/log4j -->
    <dependency>
        <groupId>log4j</groupId>
        <artifactId>log4j</artifactId>
        <version>1.2.17</version>
    </dependency>
    <!-- JUnit 4 is published under the groupId "junit" -->
    <dependency>
        <groupId>junit</groupId>
        <artifactId>junit</artifactId>
        <version>4.13.2</version>
    </dependency>
    <!-- Word to PDF conversion -->
    <dependency>
        <groupId>fr.opensagres.xdocreport</groupId>
        <artifactId>fr.opensagres.poi.xwpf.converter.pdf</artifactId>
        <version>2.0.2</version>
    </dependency>
</dependencies>
2.上代码
package main.java;
import fr.opensagres.poi.xwpf.converter.pdf.PdfConverter;
import fr.opensagres.poi.xwpf.converter.pdf.PdfOptions;
import org.apache.log4j.Logger;
import org.apache.pdfbox.contentstream.operator.Operator;
import org.apache.pdfbox.cos.COSArray;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.cos.COSString;
import org.apache.pdfbox.io.MemoryUsageSetting;
import org.apache.pdfbox.multipdf.PDFMergerUtility;
import org.apache.pdfbox.pdfparser.PDFStreamParser;
import org.apache.pdfbox.pdfwriter.ContentStreamWriter;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageContentStream;
import org.apache.pdfbox.pdmodel.common.PDStream;
import org.apache.pdfbox.pdmodel.graphics.image.LosslessFactory;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.junit.Test;
import javax.imageio.ImageIO;
import javax.imageio.ImageReader;
import javax.imageio.stream.ImageInputStream;
import java.awt.image.BufferedImage;
import java.io.*;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
/**
* @ClassName PdfboxSummary
*/
public class PdfboxSummary {
// Logger for this class; it must be keyed on PdfboxSummary so log output is attributed correctly.
private static final Logger log = Logger.getLogger(PdfboxSummary.class);
/**
 * Replaces the placeholder tags in every PDF template found in a folder, appends every
 * image found there as an extra page, and saves the result as one PDF document.
 *
 * <p>The input folder, the output file and the tag values are hard-coded sample data.
 *
 * @throws Exception if a source file cannot be read or the merged file cannot be written
 */
@Test
public void pdfMergeONE() throws Exception {
    // Path of the single merged output file.
    String outputFile="D:\\merged.pdf";
    long start = System.currentTimeMillis();
    System.out.println("===start==="+start);
    // Replacement data: the key is the tag, the value is the text that replaces it.
    HashMap<String,String> replaceMap = new HashMap<>();
    replaceMap.put("<<D1>>","D1D1D1");
    replaceMap.put("<<F7>>","F7F7F7");
    replaceMap.put("<<Annual>>","AnnualAnnualAnnual");
    replaceMap.put("<<E6>>","E6E6E6E6E6");
    replaceMap.put("<<Month>>","MonthMonthMonth");
    replaceMap.put("<<EffDate>>","EffDateEffDateEffDate");
    replaceMap.put("<<R22>>","R22R22R22R22");
    PDFMergerUtility pdfMergerUtility = new PDFMergerUtility();
    // try-with-resources guarantees the destination is closed even when a source file fails.
    try (PDDocument destination = new PDDocument()) {
        // Every regular file directly inside the input folder is a candidate.
        for (String filePath : getFile("D:\\merge")) {
            String typeStr = extensionOf(filePath);
            if ("pdf".equalsIgnoreCase(typeStr)) {
                // PDF template: replace the tags, then append its pages to the destination.
                try (PDDocument pdfDocument = PDDocument.load(new File(filePath))) {
                    for (String key : replaceMap.keySet()) {
                        replacePdfText(pdfDocument, key, replaceMap.get(key));
                    }
                    pdfMergerUtility.appendDocument(destination, pdfDocument);
                }
            } else if ("jpg".equalsIgnoreCase(typeStr) || "png".equalsIgnoreCase(typeStr)
                    || "jpeg".equalsIgnoreCase(typeStr)) {
                // Image: draw it on a new page of the destination.
                insertImageToPdf(destination, filePath);
            }
        }
        // Pages were appended directly above and no sources are registered on the merger,
        // so there is nothing for mergeDocuments() to do; just save the destination.
        destination.save(outputFile);
    }
    long end = System.currentTimeMillis();
    System.out.println("===end==="+end);
    long total = end - start;
    System.out.println("===total==="+ total);
}

/**
 * Returns the extension of a file path: the text after the last dot of the file name.
 *
 * @param filePath path using either {@code /} or {@code \} as separator
 * @return the extension without the dot, or an empty string when the file name has no dot
 */
private static String extensionOf(String filePath) {
    int lastDot = filePath.lastIndexOf('.');
    int lastSeparator = Math.max(filePath.lastIndexOf('/'), filePath.lastIndexOf('\\'));
    // A dot that belongs to a parent folder name is not an extension marker.
    return lastDot > lastSeparator ? filePath.substring(lastDot + 1) : "";
}
/**
 * Appends a new page to the document and draws the given image centered on it.
 *
 * <p>The image is never enlarged; it is shrunk proportionally when it is wider or taller
 * than the page so that it always fits completely.
 *
 * @param document  document that receives the new page
 * @param imagePath path of the image file to draw
 * @return the same {@code document} instance
 * @throws IOException if the image cannot be read or the page content cannot be written
 */
private static PDDocument insertImageToPdf(PDDocument document ,String imagePath) throws IOException {
    PDPage page = new PDPage();
    PDImageXObject pdImage = PDImageXObject.createFromFile(imagePath,document);
    float pageWidth = page.getMediaBox().getWidth();
    float pageHeight = page.getMediaBox().getHeight();
    int imageWidth = pdImage.getWidth();
    int imageHeight = pdImage.getHeight();
    // Fit both dimensions: limiting by width only lets tall images run off the page.
    float scale = Math.min(1f, Math.min(pageWidth / imageWidth, pageHeight / imageHeight));
    float drawWidth = imageWidth * scale;
    float drawHeight = imageHeight * scale;
    // try-with-resources closes the content stream even if drawing fails.
    try (PDPageContentStream contents = new PDPageContentStream(document, page)) {
        contents.drawImage(pdImage, (pageWidth - drawWidth) / 2, (pageHeight - drawHeight) / 2,
                drawWidth, drawHeight);
    }
    document.addPage(page);
    return document;
}
/**
 * Replaces a placeholder tag in the text-showing operators of every page of a PDF.
 *
 * <p>{@code Tj} operands are handled with a plain string replace. {@code TJ} operands are
 * arrays whose strings may split a tag into fragments (for example {@code "<<A1"} and
 * {@code ">>"}), possibly across several consecutive {@code TJ} operators; fragments are
 * accumulated from the first string containing {@code "<"} and replaced once the
 * accumulated text, trimmed, equals {@code searchString}. The tag must therefore contain
 * {@code "<<"} and {@code ">>"} to be found inside {@code TJ} arrays.
 *
 * <p>On a match the first contributing array is reduced to a single string holding the
 * replacement and the following contributing arrays are emptied, so other elements of
 * those arrays (kerning numbers, neighbouring strings) are dropped.
 *
 * @param document     document to modify in place
 * @param searchString tag to look for, e.g. {@code "<<D1>>"}
 * @param replacement  text written instead of the tag
 * @return the same {@code document} instance
 * @throws IOException if a page content stream cannot be parsed or rewritten
 */
private static PDDocument replacePdfText(PDDocument document, String searchString, String replacement) throws IOException {
    for (PDPage page : document.getPages()) {
        PDFStreamParser parser = new PDFStreamParser(page);
        parser.parse();
        List<Object> tokens = parser.getTokens();
        // TJ arrays that contributed to the tag text accumulated so far.
        List<COSArray> pendingArrays = new ArrayList<>();
        // Tag text accumulated so far from TJ string fragments.
        String pending = "";
        for (int j = 0; j < tokens.size(); j++) {
            Object token = tokens.get(j);
            if (!(token instanceof Operator)) {
                continue;
            }
            String opName = ((Operator) token).getName();
            // Tj and TJ both take exactly one operand, the token right before the operator.
            Object operand = j > 0 ? tokens.get(j - 1) : null;
            if ("Tj".equals(opName)) {
                if (operand instanceof COSString) {
                    COSString shown = (COSString) operand;
                    String text = shown.getString();
                    // Only rewrite strings that hold the tag: a decode/encode round trip
                    // of untouched strings could alter their bytes.
                    if (text.contains(searchString)) {
                        // NOTE(review): getBytes() uses the platform default charset; the
                        // result must match the encoding of the font in use - confirm.
                        shown.setValue(text.replace(searchString, replacement).getBytes());
                    }
                }
            } else if ("TJ".equals(opName) && operand instanceof COSArray) {
                COSArray array = (COSArray) operand;
                pending = appendTagFragments(pending, array);
                if (pending.contains("<<") && pending.contains(">>")
                        && searchString.equals(pending.trim())) {
                    System.out.println(pending);
                    pendingArrays.add(array);
                    writeReplacement(pendingArrays, replacement);
                    pendingArrays.clear();
                    pending = "";
                } else if (!pending.isEmpty()) {
                    // Track every contributing array, including one that only supplied a
                    // single "<", so no fragment of the tag is left behind on replacement.
                    pendingArrays.add(array);
                }
            }
            // A completed ">>" that did not match this tag ends the current candidate.
            if (pending.contains(">>")) {
                pending = "";
                pendingArrays.clear();
            }
        }
        PDStream updatedStream = new PDStream(document);
        // try-with-resources closes the stream even if writing the tokens fails.
        try (OutputStream out = updatedStream.createOutputStream(COSName.FLATE_DECODE)) {
            new ContentStreamWriter(out).writeTokens(tokens);
        }
        page.setContents(updatedStream);
    }
    return document;
}

/**
 * Appends the strings of a TJ array to the accumulated tag text; accumulation starts
 * with the first string that contains {@code "<"}.
 */
private static String appendTagFragments(String pending, COSArray array) {
    StringBuilder buffer = new StringBuilder(pending);
    for (int k = 0; k < array.size(); k++) {
        Object element = array.getObject(k);
        if (element instanceof COSString) {
            String text = ((COSString) element).getString();
            if (buffer.indexOf("<") >= 0 || text.contains("<")) {
                buffer.append(text);
            }
        }
    }
    return buffer.toString();
}

/**
 * Reduces the first array to one string holding the replacement and empties the rest.
 */
private static void writeReplacement(List<COSArray> arrays, String replacement) {
    for (int i = 0; i < arrays.size(); i++) {
        COSArray array = arrays.get(i);
        COSString keeper = null;
        if (i == 0) {
            // Reuse the first string element; element 0 itself may be a kerning number.
            for (int k = 0; k < array.size() && keeper == null; k++) {
                Object element = array.getObject(k);
                if (element instanceof COSString) {
                    keeper = (COSString) element;
                }
            }
            if (keeper == null) {
                keeper = new COSString(replacement);
            } else {
                keeper.setValue(replacement.getBytes());
            }
        }
        array.clear();
        if (keeper != null) {
            array.add(keeper);
        }
    }
}
/**
 * Lists the paths of the regular files directly inside a folder (sub-folders are skipped).
 *
 * @param path folder path
 * @return file paths in ascending natural order; empty when {@code path} is not a
 *         readable directory
 */
private static List<String> getFile(String path) {
    File[] entries = new File(path).listFiles();
    List<String> fileNameList = new ArrayList<>(100);
    // listFiles() returns null when the path is not a directory or cannot be read.
    if (entries == null) {
        return fileNameList;
    }
    for (File entry : entries) {
        if (entry.isFile()) {
            fileNameList.add(entry.getPath());
        }
    }
    // listFiles() guarantees no order; sort (null = natural order) so results are repeatable.
    fileNameList.sort(null);
    return fileNameList;
}
/**
 * Reads every image contained in a file through an ImageIO "jpeg" reader and appends each
 * one to the document as a new page.
 *
 * @param pdfDocument document that receives the new pages
 * @param filePath    path of the image file to read
 * @return the same {@code pdfDocument} instance
 * @throws Exception if no "jpeg" reader is available, or (wrapping the original
 *                   {@link IOException}) if the file cannot be read or a page cannot be written
 */
private static PDDocument insertToPdfByStream(PDDocument pdfDocument,String filePath) throws Exception {
    Iterator<ImageReader> iterator = ImageIO.getImageReadersByFormatName("jpeg");
    if (!iterator.hasNext()) {
        throw new Exception("The JDK does not support");
    }
    ImageReader imageReader = iterator.next();
    long timeMillis = System.currentTimeMillis();
    // Both streams are closed by try-with-resources, including on failure.
    try (InputStream fileStream = new FileInputStream(filePath);
         ImageInputStream imageInputStream = ImageIO.createImageInputStream(fileStream)) {
        imageReader.setInput(imageInputStream);
        int size = imageReader.getNumImages(true);
        for (int i = 0; i < size; i++) {
            BufferedImage image = imageReader.read(i);
            pageAddImage(pdfDocument, image);
        }
        // The document is returned unsaved; persisting it is left to the caller.
        return pdfDocument;
    } catch (IOException e) {
        log.error("To PDF Page Error", e);
        // Keep the original exception as the cause so the failure stays diagnosable.
        throw new Exception("Conversion PDF Error", e);
    } finally {
        imageReader.dispose();
        log.info("to pdf used time: "+(System.currentTimeMillis() - timeMillis));
    }
}
/**
 * Appends a new page to the document and draws the image on it, horizontally centered
 * and aligned with the top edge of the page.
 *
 * <p>The image is never enlarged; it is shrunk proportionally when it is wider or taller
 * than the page so that it always fits completely.
 *
 * @param newPdf document that receives the new page
 * @param image  image to draw
 * @throws IOException if the image cannot be embedded or the page content cannot be written
 */
private static void pageAddImage(PDDocument newPdf, BufferedImage image) throws IOException {
    PDPage page = new PDPage();
    newPdf.addPage(page);
    float pageWidth = page.getMediaBox().getWidth();
    float pageHeight = page.getMediaBox().getHeight();
    // Fit both dimensions: limiting by width only pushes tall images below the page bottom.
    float scale = Math.min(1f,
            Math.min(pageWidth / image.getWidth(), pageHeight / image.getHeight()));
    float imgWidth = image.getWidth() * scale;
    float imgHeight = image.getHeight() * scale;
    try (PDPageContentStream pageContentStream = new PDPageContentStream(newPdf, page)) {
        PDImageXObject pdImage = LosslessFactory.createFromImage(newPdf, image);
        pageContentStream.drawImage(pdImage, (pageWidth - imgWidth) / 2, pageHeight - imgHeight,
                imgWidth, imgHeight);
    }
}
/**
 * Converts a Word (.docx) document to PDF. Known limitation noted by the author: some
 * content may be lost in the conversion.
 *
 * @param docFilePath path of the Word document to read
 * @param pdfFilePath path of the PDF file to write
 * @throws Exception if the document cannot be read or the PDF cannot be written
 */
private static void wordToPdf(String docFilePath,String pdfFilePath) throws Exception {
    // All three resources are closed in reverse order, even when the conversion fails.
    try (InputStream docFile = new FileInputStream(docFilePath);
         XWPFDocument doc = new XWPFDocument(docFile);
         OutputStream out = new FileOutputStream(pdfFilePath)) {
        PdfConverter.getInstance().convert(doc, out, PdfOptions.create());
    }
    System.out.println(pdfFilePath);
}
}
参考博文:
用 Java 中的 PDFbox 替换或删除 PDF 中的文本 - IT屋-程序员软件开发技术分享社区
https://www.cnblogs.com/tankqiu/articles/4246776.html
教程 - PDFBox 中文文档 - 文江博客
Word转为PDF(Java实现)_chengp919的博客-CSDN博客_java word转pdf
?
|