关于PDF文件表格提取实现_pdf

PDF 文件的内容格式比较特殊：根据目前的实际观察，PDF 只是把源文件的内容一行一行地写入文件，再通过坐标定位的方式还原出相同的版面，所以表格在 PDF 中的表现形式也比较特殊。
实现思路：
先通过 PDF 内容识别找到表格所在的页（只是为了提高一些速度，减少其他内容的干扰），然后将表格所在的页面截取到新的 PDF 文件中，再将新生成的 PDF 转换为 HTML 文件，最后通过算法重新组装表格。此方法可识别空白列，以及一个表格中存在多行数据的情况。
用的技术框架：
jsoup ， itextpdf ， pdfbox
/** * 读取pdf文件转为list集合 * @param pdfPath * @return */public static List getDataFromPdf(String pdfPath){List datas=new ArrayList<>();String newPdfPath=pdfPath.replace(".pdf","_01.pdf");String htmlPath=pdfPath.replace(".pdf","_01.html");//确认附件表格所在的页面，返回页码int[] pageNums=readPdf(pdfPath);//读取存在表格附件的页面partitionPdfFile(pdfPath,newPdfPath,pageNums[0],pageNums[1]);byte[] bytes = getBytes(newPdfPath);try (BufferedWriter out = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(new File(htmlPath)),"UTF-8"));){//加载PDF文档PDDocument document = PDDocument.load(bytes);PDFDomTree pdfDomTree = new PDFDomTree();pdfDomTree.writeText(document,out);datas=ParseHtml(htmlPath);} catch (Exception e) {e.printStackTrace();}finally {//删除缓存文件File pdf_01=new File(newPdfPath);if(pdf_01.exists()){pdf_01.delete();}File html_01=new File(htmlPath);if(html_01.exists()){html_01.delete();}}return datas;}/**** 读取pdf 确定内容所在页* @param pdfPath*/private static int[] readPdf(String pdfPath){int[] pageNums=new int[2];try {PdfReader reader = new PdfReader(pdfPath);int pageNum = reader.getNumberOfPages();boolean isGo=false;for(int i=1;i<=pageNum;i){String pageContent = PdfTextExtractor.getTextFromPage(reader, i);//读取第i页的文档内容if((pageContent.trim().length()>0&&pageContent.startsWith("附件"))){pageNums[0]=i;isGo=true;}if(isGo&&pageContent.trim().length()<50){pageNums[1]=i-1;//break;}}} catch (Exception e) {e.printStackTrace();}finally{}return pageNums;} /** * pdf 转换为html * @param html * @return * @throws IOException */private static List ParseHtml(String html) throws IOException {org.jsoup.nodes.Document document = Jsoup.parse(new File(html), "utf-8");Elements postItems = document.select("div.page");//循环处理每页List datas=new ArrayList<>();for (int i=0;i 下面是html的解析方式，通过边框定位，找到每一行每一列所处的位置一级在该位置所属的元素。 /** * 从第二行开始（去除标题行） * @param postItem * @param table_col * @param index * @return */private static List getRow(Element postItem,Elements postItems,Elements table_col,int index) {String top = 
(process(postItems.get(index).attr("style"), "top"));String bottom = (process(postItems.get(index1).attr("style"), "top"));Elements tables = postItem.select("[style*=top:]");List data = https://www.itzhengshu.com/pdf/new ArrayList<>();double dbottom = Double.parseDouble(bottom);double dtop = Double.parseDouble(top);boolean isGo = false;for (int iiy = 0; iiy < table_col.size() - 1; iiy) {StringBuilder sbs = new StringBuilder();for (Element spostItem : tables) {String top2 = (process(spostItem.attr("style"), "top"));double top2s = Double.parseDouble(top2);if (top2s > dtop && top2s < dbottom) {String left2 = (process(spostItem.attr("style"), "left"));double[] cols = getRowCol(table_col, iiy);double left2s = Double.parseDouble(left2);if (left2s > cols[0] && left2s < cols[1]) {sbs.append(spostItem.text());}}}if(sbs.length()==0) {data.add("-");}else{data.add(sbs.toString());}}return data;}/** * 定位列的位置 * @param table_col * @param index * @return */private static double[] getRowCol(Elements table_col,int index){StringBuilder sbd=new StringBuilder();String left=(process(table_col.get(index).attr("style"),"left"));String right=(process(table_col.get(index 1).attr("style"),"left"));return new double[]{Double.parseDouble(left),Double.parseDouble(right)};} /*** 读取html中样式的指定属性* @param style* @param extract* @return*/private static String process(String style,String extract) {if (style.contains(extract)) {style = style.substring(style.indexOf(extract ":"));style = style.substring(0, style.indexOf(";"));String attr = style.substring(style.indexOf(":")1);return (attr.substring(0,attr.length()-2));}return null;} pom配置【关于PDF文件表格提取实现】com.itextpdfitextpdf5.5.13org.jsoupjsoup1.12.1commons-iocommons-io2.5org.apache.pdfboxfontbox2.0.0com.itextpdf.toolxmlworker5.5.11org.apache.poiooxml-schemas1.1


相关经验推荐
如何将几个pdf文件进行合并？这几个方法可要好好学 
 pdf查看工具哪个好用？查看软件讲解 
 jQuery实现在线预览PDF文件 
 Vue+SpringBoot+pdf.js pdf.js实现pdf的预览与下载 
 PDF文本翻译神器！复制即可翻译，这个外语翻译软件火了 
 pdf文件加水印怎么加？两个方法不容错过 
 如何将英文PDF文件迅速翻译成中文？ 
 这款PDF阅读器1秒定位论文公式变量，不同页图文也能同屏看 | 开源 
 图片转pdf格式怎么弄？