Java 文件处理系列之:word转pdf

日常操作中,word转pdf是较为常见的操作。尤其是前端上传word文档后,需要在页面预览文档的情况。前端直接预览word需要特殊的处理,但是如果由后端先把word转为pdf再预览,就会比较简单。
效果预览:

Java 文件处理系列之:word转pdf

原始word文件.docx
Java 文件处理系列之:word转pdf

转换之后的pdf文件.pdf


接下来就分享实测过的实现方式 。
环境:JDK11、Springboot 2.3.7.RELEASE、windows10、Maven
  1. 第一步,Maven 依赖配置,主要导入一些工具包
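<dependencies>
    <dependency>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-web</artifactId>
    </dependency>
    <dependency>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-test</artifactId>
        <scope>test</scope>
    </dependency>
    <dependency>
        <groupId>org.projectlombok</groupId>
        <artifactId>lombok</artifactId>
        <optional>true</optional>
    </dependency>
    <dependency>
        <groupId>org.apache.commons</groupId>
        <artifactId>commons-lang3</artifactId>
        <version>3.4</version>
    </dependency>
    <dependency>
        <groupId>com.deepoove</groupId>
        <artifactId>poi-tl</artifactId>
        <version>1.10.2</version>
    </dependency>
    <dependency>
        <groupId>fr.opensagres.xdocreport</groupId>
        <artifactId>fr.opensagres.poi.xwpf.converter.pdf</artifactId>
        <version>2.0.2</version>
    </dependency>
    <dependency>
        <groupId>org.apache.poi</groupId>
        <artifactId>poi-scratchpad</artifactId>
        <version>4.1.2</version>
    </dependency>
    <dependency>
        <groupId>fr.opensagres.xdocreport</groupId>
        <artifactId>fr.opensagres.poi.xwpf.converter.core</artifactId>
        <version>2.0.2</version>
    </dependency>
    <dependency>
        <groupId>fr.opensagres.xdocreport</groupId>
        <artifactId>fr.opensagres.poi.xwpf.converter.xhtml</artifactId>
        <version>2.0.2</version>
    </dependency>
    <dependency>
        <groupId>com.itextpdf</groupId>
        <artifactId>itextpdf</artifactId>
        <version>5.5.13.2</version>
    </dependency>
    <dependency>
        <groupId>com.itextpdf.tool</groupId>
        <artifactId>xmlworker</artifactId>
        <version>5.5.13.2</version>
    </dependency>
    <dependency>
        <groupId>com.itextpdf</groupId>
        <artifactId>itext-asian</artifactId>
        <version>5.2.0</version>
    </dependency>
    <dependency>
        <groupId>com.itextpdf</groupId>
        <artifactId>html2pdf</artifactId>
        <version>4.0.1</version>
    </dependency>
    <dependency>
        <groupId>org.jsoup</groupId>
        <artifactId>jsoup</artifactId>
        <version>1.14.3</version>
    </dependency>
</dependencies>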
  1. 第二步 , service 业务层构造
package com.yalin.cn.fileutil.word.service;import java.io.InputStream;import java.io.OutputStream;/** * @description: word生成pdf * @author: lyl * @create: 2021-05-08 16:31:47 **/public interface IWordConvertPdfService {/*** docx 转pdf** @param sourcePath word路径* @param targetPath pdf路径* @param imageDirword中的图片临时存放路径* @return boolean*/boolean convert(String sourcePath, String targetPath, String imageDir);/*** docx 转pdf** @param inword文件流* @param targetPath pdf路径* @param imageDirword中的图片临时存放路径* @return boolean*/boolean convert(InputStream in, String targetPath, String imageDir);/*** docx 转pdf** @param inword文件流* @param outpdf文件流* @param imageDir word中的图片临时存放路径* @return boolean* @throws Exception 抛出异常*/boolean convert(InputStream in, OutputStream out, String imageDir) throws Exception;}
  1. 第三步,service impl 业务实现层构造
package com.yalin.cn.fileutil.word.service.impl;

import com.yalin.cn.fileutil.util.OfficeUtil;
import com.yalin.cn.fileutil.word.service.IWordConvertPdfService;
import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Service;

import java.io.InputStream;
import java.io.OutputStream;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.Objects;

/**
 * Converts Word (docx) documents to PDF by delegating to {@link OfficeUtil}.
 *
 * <p>The path-based overloads open the required streams, delegate to the
 * stream-based overload, and translate any exception into a {@code false}
 * result after logging it.
 *
 * @author lyl
 * @create 2021-05-08 16:31:47
 */
@Service
@Slf4j
public class WordConvertPdfServiceImpl implements IWordConvertPdfService {

    /**
     * Converts a docx file to a PDF file.
     *
     * @param sourcePath path of the Word document
     * @param targetPath path of the PDF to write
     * @param imageDir   temporary directory for the images contained in the Word document
     * @return {@code true} if the conversion succeeded; {@code false} if it failed
     *         or an exception was raised (the exception is logged)
     */
    @Override
    public boolean convert(String sourcePath, String targetPath, String imageDir) {
        try (InputStream inputStream = Files.newInputStream(Paths.get(sourcePath));
             OutputStream outputStream = Files.newOutputStream(Paths.get(targetPath))) {
            return convert(inputStream, outputStream, imageDir);
        } catch (Exception e) {
            // The throwable is passed as the dedicated last argument, so the
            // message must not contain a "{}" placeholder for it.
            log.error("convert(String, String, String)异常", e);
        }
        return false;
    }

    /**
     * Converts a docx stream to a PDF file.
     *
     * @param in         stream of the Word document
     * @param targetPath path of the PDF to write
     * @param imageDir   temporary directory for the images contained in the Word document
     * @return {@code true} if the conversion succeeded; {@code false} if it failed
     *         or an exception was raised (the exception is logged)
     */
    @Override
    public boolean convert(InputStream in, String targetPath, String imageDir) {
        try (OutputStream outputStream = Files.newOutputStream(Paths.get(targetPath))) {
            return convert(in, outputStream, imageDir);
        } catch (Exception e) {
            log.error("convert(InputStream, String, String)异常", e);
        }
        return false;
    }

    /**
     * Converts a docx stream to a PDF stream.
     *
     * @param in       stream of the Word document; must not be {@code null}
     * @param out      stream the PDF is written to; must not be {@code null}
     * @param imageDir temporary directory for the images contained in the Word document
     * @return the result reported by {@link OfficeUtil#docxConvertPdf}, or
     *         {@code false} if that call raised an exception (which is logged)
     * @throws Exception if {@code in} or {@code out} is {@code null}
     */
    @Override
    public boolean convert(InputStream in, OutputStream out, String imageDir) throws Exception {
        if (Objects.isNull(in)) {
            throw new Exception("模板文件流为null!");
        }
        if (Objects.isNull(out)) {
            throw new Exception("目标文件流为null!");
        }
        try {
            // OfficeUtil.docxConvertPdf signals failure through its return
            // value rather than by throwing, so the result must be propagated
            // instead of unconditionally answering true.
            return OfficeUtil.docxConvertPdf(in, out, imageDir);
        } catch (Exception e) {
            log.error("convert(InputStream, OutputStream, String)异常", e);
        }
        return false;
    }
}
  1. 第四步,真正实现转换的工具类
package com.yalin.cn.fileutil.util;

import com.itextpdf.text.*;
import com.itextpdf.text.pdf.BaseFont;
import com.itextpdf.text.pdf.PdfWriter;
import com.itextpdf.tool.xml.XMLWorkerHelper;
import com.yalin.cn.fileutil.font.AutoFontFactory;
import fr.opensagres.poi.xwpf.converter.core.BasicURIResolver;
import fr.opensagres.poi.xwpf.converter.core.FileImageExtractor;
import fr.opensagres.poi.xwpf.converter.xhtml.XHTMLConverter;
import fr.opensagres.poi.xwpf.converter.xhtml.XHTMLOptions;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Entities;
import org.jsoup.select.Elements;

import java.io.*;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.Objects;

/**
 * Converts Word (docx) documents to PDF in three stages:
 * docx to XHTML, XHTML clean-up with jsoup, XHTML to PDF with iText.
 *
 * @author lyl
 * @create 2021-04-23 11:09:51
 */
@Slf4j
public class OfficeUtil {

    /**
     * Converts a docx document to XHTML.
     *
     * <p>The input stream is always closed by this method, whether or not the
     * conversion succeeds.
     *
     * @param in       stream of the docx document
     * @param imageDir directory in which the images of the document are stored;
     *                 when {@code null}, no image extractor, URI resolver,
     *                 style or fragment options are configured
     * @return the generated XHTML, decoded as UTF-8
     * @throws Exception if the document cannot be loaded or converted; the
     *                   original failure is available as the cause
     */
    public static String docx2Html(InputStream in, String imageDir) throws Exception {
        XWPFDocument document = null;
        try {
            // 1> Load the document
            document = new XWPFDocument(in);

            // 2> Configure the XHTML conversion; the extractor and URI resolver
            //    both point at the image directory
            XHTMLOptions options = XHTMLOptions.create();
            if (Objects.nonNull(imageDir)) {
                options.setExtractor(new FileImageExtractor(new File(imageDir)));
                options.URIResolver(new BasicURIResolver(imageDir));
                options.setIgnoreStylesIfUnused(false);
                options.setFragment(true);
            }

            // 3> Convert the document to XHTML held in memory
            ByteArrayOutputStream xhtml = new ByteArrayOutputStream();
            XHTMLConverter.getInstance().convert(document, xhtml, options);
            return new String(xhtml.toByteArray(), StandardCharsets.UTF_8);
        } catch (Exception e) {
            throw new Exception("Failed to convert docx to XHTML", e);
        } finally {
            // Closing stays best-effort so that a close failure cannot turn a
            // successful conversion into an error.
            closeQuietly(document);
            closeQuietly(in);
        }
    }

    /**
     * Closes a resource, logging instead of propagating any failure.
     *
     * @param resource resource to close; may be {@code null}
     */
    private static void closeQuietly(Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (Exception e) {
            log.warn("Failed to close {}", resource.getClass().getSimpleName(), e);
        }
    }

    /**
     * Normalizes HTML with jsoup so that it can be parsed as XHTML.
     *
     * <p>The {@code style} attribute of the document root and of every
     * {@code div} is cleared when it mentions {@code width}, then the markup
     * is re-serialized with XML syntax and XHTML escaping.
     *
     * @param html HTML content
     * @return the normalized HTML
     */
    private static String formatHtml(String html) {
        org.jsoup.nodes.Document doc = Jsoup.parse(html);

        // Remove oversized widths
        clearStyleIfWidthSet(doc);
        Elements divs = doc.select("div");
        for (Element div : divs) {
            clearStyleIfWidthSet(div);
        }

        // Have jsoup emit closed tags
        doc.outputSettings().syntax(org.jsoup.nodes.Document.OutputSettings.Syntax.xml);
        doc.outputSettings().escapeMode(Entities.EscapeMode.xhtml);
        return doc.html();
    }

    /**
     * Empties the {@code style} attribute of an element when it contains the
     * text {@code width}. Note that the whole attribute is cleared, not only
     * the width declaration.
     *
     * @param element element to inspect
     */
    private static void clearStyleIfWidthSet(Element element) {
        String style = element.attr("style");
        if (StringUtils.isNotEmpty(style) && style.contains("width")) {
            element.attr("style", "");
        }
    }

    /**
     * Renders XHTML to a PDF on A4 pages.
     *
     * @param html XHTML content
     * @param out  stream the PDF is written to; it is not closed by this method
     * @throws Exception if rendering fails; the original failure is available
     *                   as the cause
     */
    public static void htmlToPdf(String html, OutputStream out) throws Exception {
        // Paper
        Document document = new Document(PageSize.A4);
        Exception failure = null;
        try {
            // Pen
            PdfWriter writer = PdfWriter.getInstance(document, out);
            document.open();

            // XHTML to PDF
            InputStream xhtml = new ByteArrayInputStream(html.getBytes(StandardCharsets.UTF_8));
            XMLWorkerHelper.getInstance().parseXHtml(writer, document, xhtml,
                    StandardCharsets.UTF_8, new AutoFontProvider());
        } catch (Exception e) {
            failure = new Exception("Failed to render XHTML to PDF", e);
            throw failure;
        } finally {
            try {
                document.close();
            } catch (RuntimeException closeFailure) {
                // An exception thrown from finally would replace the one
                // already in flight; keep the rendering failure as primary.
                if (failure == null) {
                    throw closeFailure;
                }
                failure.addSuppressed(closeFailure);
            }
        }
    }

    /**
     * Font provider that answers every request with the base font supplied by
     * {@link AutoFontFactory}, ignoring the requested font name and encoding.
     */
    private static final class AutoFontProvider implements FontProvider {

        @Override
        public boolean isRegistered(String fontName) {
            return false;
        }

        @Override
        public Font getFont(String fontName, String encoding, boolean embedded,
                            float size, int style, BaseColor color) {
            try {
                BaseFont baseFont = AutoFontFactory.getBaseFont();
                return new Font(baseFont, size, style, color);
            } catch (Exception e) {
                // NOTE(review): null is returned to keep the existing
                // behavior; confirm whether iText tolerates a null font or
                // whether failing fast here would be preferable.
                log.error("Unable to create font from AutoFontFactory", e);
                return null;
            }
        }
    }

    /**
     * Converts a docx document to PDF.
     *
     * <p>Failures are logged and reported through the return value; no
     * exception is propagated for them. The input stream is closed as part of
     * the conversion (see {@link #docx2Html}).
     *
     * @param in       stream of the docx document
     * @param out      stream the PDF is written to
     * @param imageDir directory in which the images of the document are stored
     * @return {@code true} if the PDF was produced, {@code false} otherwise
     */
    public static boolean docxConvertPdf(InputStream in, OutputStream out, String imageDir) {
        try {
            String xhtml = formatHtml(docx2Html(in, imageDir));
            htmlToPdf(xhtml, out);
            return true;
        } catch (Exception e) {
            log.error("docx to pdf conversion failed", e);
            return false;
        }
    }
}
备注:OfficeUtil中用到的AutoFontFactory用于提供自定义字体。因为linux环境下缺少某些中文字体,转换后的pdf会出现乱码。解决方案之一,就是从windows字体库中复制一个字体文件,放到resource目录下,在代码中引用即可。
package com.yalin.cn.fileutil.font;

import com.itextpdf.text.DocumentException;
import com.itextpdf.text.pdf.BaseFont;

import java.io.IOException;

/**
 * Factory for the base font used when rendering PDFs.
 *
 * @author lyl
 * @create 2022-01-17 15:38:29
 */
public class AutoFontFactory {

    /**
     * Font descriptor passed to iText (option 1: a font shipped with the
     * application; the font file has to be present).
     *
     * <p>Alternatives that were considered:
     * <ul>
     *   <li>Option 2, a font installed on the local machine:
     *       {@code BaseFont.createFont("C:/Windows/Fonts/seguisym.ttf", BaseFont.IDENTITY_H, BaseFont.EMBEDDED)}</li>
     *   <li>Option 3, the iTextAsian jar, which needs no font file:
     *       {@code BaseFont.createFont("STSong-Light", "UniGB-UCS2-H", BaseFont.EMBEDDED)}</li>
     * </ul>
     */
    private static final String RESOURCE_FONT = "/font/simsun.ttc,0";

    /**
     * Creates the base font.
     *
     * @return the base font, created with identity-H encoding and embedding enabled
     * @throws IOException       if raised by {@code BaseFont.createFont}
     * @throws DocumentException if raised by {@code BaseFont.createFont}
     */
    public static BaseFont getBaseFont() throws IOException, DocumentException {
        return BaseFont.createFont(RESOURCE_FONT, BaseFont.IDENTITY_H, BaseFont.EMBEDDED);
    }
}
测试类
@Testvoid wordConvertPdf() {String basePath = "C:\Users\lyl\Desktop\";String sourcePath = basePath"原始word文件.docx";String targetPath = basePath"转换之后的pdf文件.pdf";String imagePath = basePath"img"File.separator;WordConvertPdfServiceImpl tt = new WordConvertPdfServiceImpl();boolean flag = tt.convert(sourcePath, targetPath, imagePath);System.out.println(flag);}

相关经验推荐