JAVA读取（DOC、DOCX、PDF、PPT、PPTX）文件文本内容及图片

以下为瞎扯淡：

温馨提示：有很多方法均可以解析这些常见的文件，以下内容使用的是apache-poi + apache-pdfbox实现的。

关于文档解析，在网上搜索了很久，无奈内容太过繁杂，找不到合适的代码，一大半都是只支持文本。没办法，只能自己在网上一点一点CV了，最终提取了这些代码，不能说好用吧，应该可解燃眉之急。关于doc文档以及pdf文档还是有很多问题的，后续希望大佬们能在帖子下面多多指正，能优化一下代码，那就更好了。

以下为正文内容：

首先把以下这些依赖干进去

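<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi</artifactId>
    <version>4.1.0</version>
</dependency>
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-scratchpad</artifactId>
    <version>4.1.0</version>
</dependency>
<dependency>
    <groupId>org.apache.pdfbox</groupId>
    <artifactId>pdfbox</artifactId>
    <version>2.0.22</version>
</dependency>

注：下面的代码还用到了 XWPF/XSLF（位于 poi-ooxml 中，用于解析 docx/pptx）、Apache HttpClient 以及 dubbo 的 CollectionUtils，这些依赖如果项目里没有，也需要一并引入。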

要测试的话给你贴一个文档地址吧：http://api.idocv.com/data/doc/manual.docx （但是这个在线文档是没有图片的）

/**
 * Demo entry point: converts one local file and one remote file and prints
 * the text returned for each, in that order.
 *
 * @param args unused
 * @throws IOException if either document cannot be read or an image cannot be written
 */
public static void main(String[] args) throws IOException {
    // Local file first; its pictures are written below the given image directory.
    System.out.println(processDocumentFromFilePath("E:\\VPN系统使用手册.pptx", "E:\\临时图片"));

    // Then a document fetched over HTTP.
    System.out.println(processDocumentFromUrl("http://api.idocv.com/data/doc/manual.docx", "E:\\临时图片"));
}

然后上车：飕飕飕

import com.alibaba.dubbo.common.utils.CollectionUtils;import org.apache.http.HttpResponse;import org.apache.http.client.HttpClient;import org.apache.http.client.methods.HttpGet;import org.apache.http.impl.client.HttpClients;import org.apache.pdfbox.pdmodel.PDDocument;import org.apache.pdfbox.text.PDFTextStripper;import org.apache.poi.hslf.usermodel.*;import org.apache.poi.hwpf.HWPFDocument;import org.apache.poi.hwpf.extractor.WordExtractor;import org.apache.poi.hwpf.usermodel.Picture;import org.apache.poi.sl.usermodel.TextParagraph;import org.apache.poi.xslf.usermodel.*;import org.apache.poi.xwpf.usermodel.*;import java.io.*;import java.util.Date;import java.util.List;import java.util.stream.Collectors;public class FileProcessorUtils { /*** * 此方法针对本地文件 * 提取文件信息并返回内容 * @param filePath 文件储存地址 * @param imgRoot 图片存储地址 * @return */public static String processDocumentFromFilePath(String filePath,String imgRoot) throws IOException {File file = new File(filePath);FileInputStream fileInputStream = new FileInputStream(file);// 根据文件类型调用适当的处理方法switch (fileTypeName(filePath)) {case "doc":return processWordDocDocumentFromStream(fileInputStream,imgRoot);case "docx":return processWordDocxDocumentFromStream(fileInputStream,imgRoot);case "pdf":return processPdfDocumentFromStream(fileInputStream,imgRoot);case "ppt":return processPptDocumentFromStream(fileInputStream,imgRoot);case "pptx":return processPptxDocumentFromStream(fileInputStream,imgRoot);default:throw new RuntimeException("不支持的文件格式,文件解析目前只支持(DOC/DOCX/PDF/PPT/PPTX)");}}/*** * 此方法针对网络文件 * 提取文件信息并返回内容 * @param downloadUrl 文件下载链接 * @param imgRoot 图片存储地址 * @return */public static String processDocumentFromUrl(String downloadUrl,String imgRoot) throws IOException {HttpClient httpClient = HttpClients.createDefault();HttpGet httpGet = new HttpGet(downloadUrl);HttpResponse response = httpClient.execute(httpGet);//获取文件类型// TODO: 2023/9/14此处并不是所有的下载链接都存在后缀信息,如果为了提升代码的健壮性，可以在此处修改代码以获取文件类型String typeName = fileTypeName(downloadUrl);// 
根据文件类型调用适当的处理方法switch (typeName) {case "doc":return processWordDocDocumentFromStream(response.getEntity().getContent(),imgRoot);case "docx":return processWordDocxDocumentFromStream(response.getEntity().getContent(),imgRoot);case "pdf":return processPdfDocumentFromStream(response.getEntity().getContent(),imgRoot);case "ppt":return processPptDocumentFromStream(response.getEntity().getContent(),imgRoot);case "pptx":return processPptxDocumentFromStream(response.getEntity().getContent(),imgRoot);default:throw new RuntimeException("不支持的文件格式,文件解析目前只支持(DOC/DOCX/PDF/PPT/PPTX)");}}/*** * word(doc)文件处理 * @param inputStream(文件流) * @return */private static String processWordDocDocumentFromStream(InputStream inputStream,String imageRoot) throws IOException {HWPFDocument document = new HWPFDocument(inputStream);StringBuilder htmlText = new StringBuilder();WordExtractor extractor = new WordExtractor(document);try {String[] paragraphs = extractor.getParagraphText();for (int paragraphIndex = 0; paragraphIndex < paragraphs.length; paragraphIndex++) {String paragraphText = paragraphs[paragraphIndex];//获取文本对齐方式String justification = getJustification(document.getRange().getParagraph(paragraphIndex).getJustification());// 根据需要添加其他HTML标签htmlText.append("").append(paragraphText).append("").append("
");}// 提取图片List pictures = document.getPicturesTable().getAllPictures();for (int i = 0; i < pictures.size(); i++) {Picture picture = pictures.get(i);byte[] pictureData = picture.getContent();String newFileName = new Date().getTime() + i + "_image." + picture.suggestFileExtension(); // 可以根据需要更改扩展名,suggestFileExtension()方法自动获取合适的图片类型String imgPath = saveImageToFile(pictureData, newFileName, imageRoot);htmlText.append("
");}} finally {extractor.close();document.close();}return htmlText.toString();}/*** * word(docx)文件处理 * @param inputStream(文件流) * @return */private static String processWordDocxDocumentFromStream(InputStream inputStream,String imageRoot) throws IOException {//获取文件内容XWPFDocument document = new XWPFDocument(inputStream);StringBuilder htmlText = new StringBuilder();try {//获取所有元素List paragraphs = document.getParagraphs();//根据元素类型追加for (XWPFParagraph paragraph : paragraphs) {//获取文本对齐方式ParagraphAlignment alignment = paragraph.getAlignment();htmlText.append("");List runs = paragraph.getRuns();for (XWPFRun run : runs) {// 处理字体大小、样式等信息String fontSize = run.getFontSize() + "pt";String fontFamily = run.getFontFamily();// 添加样式信息到HTMLhtmlText.append("" + run.text() + "");}htmlText.append("
");// 检查当前行段落是否有图片存在List pictures = paragraph.getRuns().stream().flatMap(run -> run.getEmbeddedPictures().stream()).collect(Collectors.toList());if(CollectionUtils.isNotEmpty(pictures)){if(pictures.size()>0){pictures.forEach( bean ->{XWPFPictureData pictureData = bean.getPictureData();String newFileName = new Date().getTime() + "_image." + pictureData.suggestFileExtension();String imgPath = null;try {imgPath = saveImageToFile(pictureData.getData(), newFileName, imageRoot);} catch (IOException e) {throw new RuntimeException(e);}htmlText.append("
");});}}}} finally {document.close();}return htmlText.toString();}/*** * Pdf文件处理 * @param inputStream(文件流) * @return */private static String processPdfDocumentFromStream(InputStream inputStream,String imageRoot) throws IOException {PDDocument pdfDocument = PDDocument.load(inputStream);PDFTextStripper textStripper = new PDFTextStripper();StringBuilder htmlText = new StringBuilder();String[] lines = textStripper.getText(pdfDocument).split("\n");for (String line : lines) {htmlText.append("").append(line).append("
");}pdfDocument.close();return htmlText.toString();}/** * 处理PPT（.ppt）文件 * @param inputStream（文件流） * @return * @throws IOException */private static String processPptDocumentFromStream(InputStream inputStream,String imageRoot) throws IOException {HSLFSlideShow ppt = new HSLFSlideShow(inputStream);StringBuilder pptText = new StringBuilder();try {// 提取文本内容for (HSLFSlide slide : ppt.getSlides()) {for (HSLFShape shape : slide.getShapes()) {//如果是文本处理文本if (shape instanceof HSLFTextShape) {HSLFTextShape textShape = (HSLFTextShape) shape;for (HSLFTextParagraph paragraph : textShape.getTextParagraphs()) {//获取文本对齐方式TextParagraph.TextAlign textAlign = paragraph.getTextAlign();pptText.append("");for (HSLFTextRun run : paragraph.getTextRuns()) {// 处理字体大小、字体样式等信息String fontSize = run.getFontSize() + "pt";String fontFamily = run.getFontFamily();run.getRawText();// 添加样式信息到HTMLpptText.append("" + run.getRawText() + "");}pptText.append("
"); // 换行处理}}else if (shape instanceof HSLFPictureShape) {// 如果是图片，处理图片HSLFPictureShape pictureShape = (HSLFPictureShape) shape;HSLFPictureData pictureData = pictureShape.getPictureData();String contentType = pictureData.getContentType();String newFileName = new Date().getTime() + "_image." + imageTypeName(contentType);String imgPath = saveImageToFile(pictureData.getData(), newFileName, imageRoot);pptText.append("
");}}}} finally {ppt.close();}return pptText.toString();}/** * 处理PPTX（.pptx）文件 * @param inputStream（文件流） * @return * @throws IOException */private static String processPptxDocumentFromStream(InputStream inputStream,String imageRoot) throws IOException {XMLSlideShow pptx = new XMLSlideShow(inputStream);StringBuilder pptxText = new StringBuilder();try {// 提取文本内容for (XSLFSlide slide : pptx.getSlides()) {for (XSLFShape shape : slide.getShapes()) {if (shape instanceof XSLFTextShape) {XSLFTextShape textShape = (XSLFTextShape) shape;for (XSLFTextParagraph paragraph : textShape.getTextParagraphs()) {//获取文本对齐方式TextParagraph.TextAlign textAlign = paragraph.getTextAlign();pptxText.append("");for (XSLFTextRun run : paragraph.getTextRuns()) {// 处理字体大小、字体样式等信息String fontSize = run.getFontSize() + "pt";String fontFamily = run.getFontFamily();// 添加样式信息到HTMLpptxText.append("" + run.getRawText() + "");}pptxText.append("
"); // 换行处理}}else if (shape instanceof XSLFPictureShape) {// 如果是图片，处理图片XSLFPictureShape pictureShape = (XSLFPictureShape) shape;XSLFPictureData pictureData = pictureShape.getPictureData();String newFileName = new Date().getTime() + "_image." + pictureData.suggestFileExtension();String imgPath = saveImageToFile(pictureData.getData(), newFileName, imageRoot);pptxText.append("
");}}}} finally {pptx.close();}return pptxText.toString();}/** * 保存图片到指定位置，并返回引用地址 * @param imageData * @param imageRoot * @return * @throws IOException */public static String saveImageToFile(byte[] imageData, String imageFileName, String imageRoot) throws IOException {String imagePath = imageRoot + File.separator + imageFileName;File file = new File(imageRoot);if(!file.exists()){file.mkdir();}try (FileOutputStream fos = new FileOutputStream(imagePath)) {fos.write(imageData);}return imagePath;}/** * 表格处理 * @param table * @return */private static String getTableHtmlText(XWPFTable table) {StringBuilder tableHtml = new StringBuilder("");for (XWPFTableRow row : table.getRows()) {tableHtml.append("");for (XWPFTableCell cell : row.getTableCells()) {tableHtml.append("");}tableHtml.append("");}tableHtml.append("").append(cell.getText()).append("
");return tableHtml.toString();}/*** * 获取文件后缀 * @param filePath * @return */private static String fileTypeName(String filePath) {int dotIndex = filePath.lastIndexOf(".");if (dotIndex > 0) {return filePath.substring(dotIndex + 1).toLowerCase();}return "";}/*** * 获取图片类型 * @param imagePath * @return */private static String imageTypeName(String imagePath) {int dotIndex = imagePath.lastIndexOf("/");if (dotIndex > 0) {return imagePath.substring(dotIndex + 1).toLowerCase();}return "";}/*** * doc文档获取当前行对齐方式 默认左对齐 * @param type * @return */private static String getJustification(Integer type) {switch (type) {case 0:return "left";case 1:return "center";case 2:return "right";default:return "left";}}}

JAVA读取（DOC、DOCX、PDF、PPT、PPTX）文件文本内容及图片

以下为瞎扯淡：

温馨提示：有很多方法均可以解析这些常见的文件，以下内容使用的是apache-poi + apache-pdfbox实现的。

以下为正文内容：

首先把以下这些依赖干进去

要测试的话给你贴一个文档地址吧：（但是这个在线文档是没有图片滴）

然后上车：飕飕飕

最新关注

热文推荐

KubeSphere 社区双周报 | Fluent Operator 2.6.0 发布 | 2023.11.10-11.23

Android架构组件LiveData

GreatSQL社区与Amazon、Facebook、Tencent共同被MySQL致谢

合并PDF（将多个pdf文件整合成一个pdf文件）

认识HTTPS以及了解HTTPS的加密过程

安装Node.js和cnpm

JAVA读取（DOC、DOCX、PDF、PPT、PPTX）文件文本内容及图片

以下为瞎扯淡：

温馨提示：有很多方法均可以解析这些常见的文件，以下内容使用的是apache-poi + apache-pdfbox实现的。

以下为正文内容：

首先把以下这些依赖干进去

要测试的话给你贴一个文档地址吧：（但是这个在线文档是没有图片滴）

然后上车：飕飕飕

相关文章

最新关注

热文推荐