Java method to convert Word/Pdf/TXT to HTML
一:Java实现将word转换为html
1:引入依赖
<dependency>
    <groupId>fr.opensagres.xdocreport</groupId>
    <artifactId>fr.opensagres.xdocreport.document</artifactId>
    <version>1.0.5</version>
</dependency>
<dependency>
    <groupId>fr.opensagres.xdocreport</groupId>
    <artifactId>org.apache.poi.xwpf.converter.xhtml</artifactId>
    <version>1.0.5</version>
</dependency>
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi</artifactId>
    <version>3.12</version>
</dependency>
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-scratchpad</artifactId>
    <version>3.12</version>
</dependency>
2:代码demo
package com.svse.controller;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Locale;

import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;

import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.converter.PicturesManager;
import org.apache.poi.hwpf.converter.WordToHtmlConverter;
import org.apache.poi.hwpf.usermodel.PictureType;
import org.apache.poi.xwpf.converter.core.BasicURIResolver;
import org.apache.poi.xwpf.converter.core.FileImageExtractor;
import org.apache.poi.xwpf.converter.core.FileURIResolver;
import org.apache.poi.xwpf.converter.core.IURIResolver;
import org.apache.poi.xwpf.converter.core.IXWPFConverter;
import org.apache.poi.xwpf.converter.xhtml.XHTMLConverter;
import org.apache.poi.xwpf.converter.xhtml.XHTMLOptions;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.w3c.dom.Document;

/**
 * Demo that converts Word documents stored under {@link #STORAGEPATH} to HTML files.
 *
 * <p>{@link #Word2003ToHtml(String)} handles the binary {@code .doc} format and
 * {@link #Word2007ToHtml(String)} handles {@code .docx}. Both write the HTML next to the
 * source document, prefix generated names with a timestamp, and print an
 * {@code http://IP:PORT//uploadFile/...} URL for the result.
 * NOTE(review): the printed URLs presume a web server exposes the storage directory
 * under {@code /uploadFile} - confirm against the deployment.
 */
public class TestWordToHtml {

    /** Directory that holds the source documents and receives the generated output. */
    public static final String STORAGEPATH = "C://works//files//";
    /** Host used when building the URLs embedded in / printed for the generated HTML. */
    public static final String IP = "192.168.30.222";
    /** Port used when building the URLs embedded in / printed for the generated HTML. */
    public static final String PORT = "8010";

    public static void main(String[] args) throws IOException, TransformerException, ParserConfigurationException {
        TestWordToHtml wt = new TestWordToHtml();
        // For a legacy .doc file call wt.Word2003ToHtml("甲骨文考证.doc") instead.
        wt.Word2007ToHtml("甲骨文考证.docx");
    }

    /**
     * Converts a Word 97-2003 ({@code .doc}) document to an HTML file.
     *
     * <p>Embedded pictures are saved under {@code STORAGEPATH + "fileImage/"} and referenced
     * from the HTML through an absolute {@code http://} URL.
     *
     * @param fileName name of the document, relative to {@link #STORAGEPATH}
     * @throws IOException if the document cannot be read or the HTML cannot be written
     * @throws TransformerException if serializing the HTML DOM fails
     * @throws ParserConfigurationException if no DOM document builder can be created
     */
    public void Word2003ToHtml(String fileName) throws IOException, TransformerException, ParserConfigurationException {
        // Pictures found in the document are written to this directory.
        final String imagepath = STORAGEPATH + "fileImage/";
        final String strRanString = getRandomNum();
        String htmlName = baseName(fileName) + "2003.html";
        File source = new File(STORAGEPATH + fileName);

        Document htmlDocument;
        // try-with-resources: the input stream is closed even when parsing fails.
        try (InputStream input = new FileInputStream(source)) {
            HWPFDocument wordDocument = new HWPFDocument(input);
            WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(
                    DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument());

            // Decide where pictures are stored and how the HTML refers to them.
            wordToHtmlConverter.setPicturesManager(new PicturesManager() {
                @Override
                public String savePicture(byte[] content, PictureType pictureType, String suggestedName,
                        float widthInches, float heightInches) {
                    File imgDir = new File(imagepath);
                    if (!imgDir.exists()) {
                        imgDir.mkdirs();
                    }

                    File picture = new File(imagepath + strRanString + suggestedName);
                    try (OutputStream os = new FileOutputStream(picture)) {
                        os.write(content);
                    } catch (IOException e) {
                        // Best effort: a picture that cannot be saved must not abort the conversion.
                        e.printStackTrace();
                    }

                    return "http://" + IP + ":" + PORT + "//uploadFile/fileImage/" + strRanString + suggestedName;
                }
            });

            wordToHtmlConverter.processDocument(wordDocument);
            htmlDocument = wordToHtmlConverter.getDocument();
        }

        File htmlFile = new File(STORAGEPATH + strRanString + htmlName);
        try (OutputStream outStream = new FileOutputStream(htmlFile)) {
            Transformer serializer = TransformerFactory.newInstance().newTransformer();
            serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8");
            serializer.setOutputProperty(OutputKeys.INDENT, "yes");
            serializer.setOutputProperty(OutputKeys.METHOD, "html");
            serializer.transform(new DOMSource(htmlDocument), new StreamResult(outStream));
        }

        System.out.println("生成html文件路径:" + "http://" + IP + ":" + PORT + "//uploadFile/" + strRanString + htmlName);
    }

    /**
     * Converts a Word 2007+ ({@code .docx}) document to an XHTML fragment file.
     *
     * <p>Prints a message and returns without converting when the file does not exist or is
     * not a {@code .docx} file. Failures during conversion are reported on standard error
     * and not rethrown, as in the original demo.
     *
     * @param fileName name of the document, relative to {@link #STORAGEPATH}
     * @throws IOException declared for interface compatibility with existing callers
     */
    public void Word2007ToHtml(String fileName) throws IOException {
        final String strRanString = getRandomNum();

        // Used both as the image directory and as the prefix of the generated HTML file name.
        String filepath = STORAGEPATH + strRanString;
        String htmlName = baseName(fileName) + "2007.html";
        File f = new File(STORAGEPATH + fileName);
        if (!f.exists()) {
            System.out.println("Sorry File does not Exists!");
            return;
        }
        // Case-insensitive so that ".Docx" and similar spellings are accepted too.
        if (!f.getName().toLowerCase(Locale.ROOT).endsWith(".docx")) {
            System.out.println("Enter only MS Office 2007+ files");
            return;
        }

        try (InputStream in = new FileInputStream(f)) {
            // 1) Load the Word document.
            XWPFDocument document = new XWPFDocument(in);

            // 2) Configure the conversion: images are extracted into imageFolderFile and
            //    referenced through an absolute http:// URL.
            File imageFolderFile = new File(filepath);
            XHTMLOptions options = XHTMLOptions.create();
            options.setExtractor(new FileImageExtractor(imageFolderFile));
            options.URIResolver(new IURIResolver() {
                @Override
                public String resolve(String uri) {
                    return "http://" + IP + ":" + PORT + "//uploadFile/" + strRanString + "/" + uri;
                }
            });
            options.setIgnoreStylesIfUnused(false);
            options.setFragment(true);

            // 3) Convert the document to XHTML; the output stream is closed in all cases.
            try (OutputStream out = new FileOutputStream(new File(filepath + htmlName))) {
                IXWPFConverter<XHTMLOptions> converter = XHTMLConverter.getInstance();
                converter.convert(document, out, options);
            }
            System.out.println("html路径:" + "http://" + IP + ":" + PORT + "//uploadFile/" + strRanString + htmlName);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Returns the current local time formatted as {@code yyyyMMddHHmmss}.
     *
     * <p>Despite the name, the value is a timestamp with one-second resolution, not a random
     * number; two calls within the same second return the same string.
     * Original author: zsq, created 2019-12-07.
     *
     * @return the timestamp string used to prefix generated files and directories
     */
    public static String getRandomNum() {
        // SimpleDateFormat is not thread-safe, so a fresh instance is created per call.
        SimpleDateFormat sdf = new SimpleDateFormat("yyyyMMddHHmmss");
        return sdf.format(new Date());
    }

    /** Strips the last extension from a file name; names without a dot are returned unchanged. */
    private static String baseName(String fileName) {
        int dot = fileName.lastIndexOf('.');
        return dot < 0 ? fileName : fileName.substring(0, dot);
    }

}
二:Java实现将Pdf转换为html
1: 引入依赖
<dependency>
    <groupId>net.sf.cssbox</groupId>
    <artifactId>pdf2dom</artifactId>
    <version>1.7</version>
</dependency>
<dependency>
    <groupId>org.apache.pdfbox</groupId>
    <artifactId>pdfbox</artifactId>
    <version>2.0.12</version>
</dependency>
<dependency>
    <groupId>org.apache.pdfbox</groupId>
    <artifactId>pdfbox-tools</artifactId>
    <version>2.0.12</version>
</dependency>
2:代码Demo
1 public class PdfToHtml { 2 3 /* 4 pdf转换html 5 */ 6 public void pdfToHtmlTest(String inPdfPath,String outputHtmlPath) { 7 // String outputPath = "C:\\works\\files\\ZSQ保密知识测试题库.html"; 8 9 //try() 写在()里面会自动关闭流 10 try{ 11 BufferedWriter out = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(new File(outputHtmlPath)),"utf-8")); 12 //加载PDF文档 13 //PDDocument document = PDDocument.load(bytes); 14 PDDocument document = PDDocument.load(new File(inPdfPath)); 15 PDFDomTree pdfDomTree = new PDFDomTree(); 16 pdfDomTree.writeText(document,out); 17 } catch (Exception e) { 18 e.printStackTrace(); 19 } 20 } 21 22 public static void main(String[] args) throws IOException { 23 PdfToHtml ph=new PdfToHtml(); 24 String pdfPath="C:\\works\\files\\武研中心行政考勤制度.pdf"; 25 String outputPath="C:\\works\\files\\武研中心行政考勤制度.html"; 26 ph.pdfToHtmlTest(pdfPath,outputPath); 27 } 28 29 }
三:Java实现将TXT转换为html
1 /* 2 * txt文档转html 3 filePath:txt原文件路径 4 htmlPosition:转化后生成的html路径 5 */ 6 public static void txtToHtml(String filePath, String htmlPosition) { 7 try { 8 //String encoding = "GBK"; 9 File file = new File(filePath); 10 if (file.isFile() && file.exists()) { // 判断文件是否存在 11 InputStreamReader read = new InputStreamReader(new FileInputStream(file), "GBK"); 12 // 考虑到编码格式 13 BufferedReader bufferedReader = new BufferedReader(read); 14 // 写文件 15 FileOutputStream fos = new FileOutputStream(new File(htmlPosition)); 16 OutputStreamWriter osw = new OutputStreamWriter(fos, "GBK"); 17 BufferedWriter bw = new BufferedWriter(osw); 18 String lineTxt = null; 19 while ((lineTxt = bufferedReader.readLine()) != null) { 20 bw.write("   "+lineTxt + "</br>"); 21 } 22 bw.close(); 23 osw.close(); 24 fos.close(); 25 read.close(); 26 } else { 27 System.out.println("找不到指定的文件"); 28 } 29 } catch (Exception e) { 30 System.out.println("读取文件内容出错"); 31 e.printStackTrace(); 32 } 33 }
The above is the detailed content of Java method to convert Word/Pdf/TXT to HTML. For more information, please follow other related articles on the PHP Chinese website!

Hot AI Tools

Undresser.AI Undress
AI-powered app for creating realistic nude photos

AI Clothes Remover
Online AI tool for removing clothes from photos.

Undress AI Tool
Undress images for free

Clothoff.io
AI clothes remover

Video Face Swap
Swap faces in any video effortlessly with our completely free AI face swap tool!

Hot Article

Hot Tools

Notepad++7.3.1
Easy-to-use and free code editor

SublimeText3 Chinese version
Chinese version, very easy to use

Zend Studio 13.0.1
Powerful PHP integrated development environment

Dreamweaver CS6
Visual web development tools

SublimeText3 Mac version
God-level code editing software (SublimeText3)

Hot Topics

This tutorial demonstrates how to efficiently process XML documents using PHP. XML (eXtensible Markup Language) is a versatile text-based markup language designed for both human readability and machine parsing. It's commonly used for data storage an

Java 8 introduces the Stream API, providing a powerful and expressive way to process data collections. However, a common question when using Stream is: How to break or return from a forEach operation? Traditional loops allow for early interruption or return, but Stream's forEach method does not directly support this method. This article will explain the reasons and explore alternative methods for implementing premature termination in Stream processing systems. Further reading: Java Stream API improvements Understand Stream forEach The forEach method is a terminal operation that performs one operation on each element in the Stream. Its design intention is

HTML defines the web structure, CSS is responsible for style and layout, and JavaScript gives dynamic interaction. The three perform their duties in web development and jointly build a colorful website.

React combines JSX and HTML to improve user experience. 1) JSX embeds HTML to make development more intuitive. 2) The virtual DOM mechanism optimizes performance and reduces DOM operations. 3) Component-based management UI to improve maintainability. 4) State management and event processing enhance interactivity.

Web development relies on HTML, CSS, and JavaScript: 1) HTML structures content, 2) CSS styles it, and 3) JavaScript adds interactivity, forming the basis of modern web experiences.

Capsules are three-dimensional geometric figures, composed of a cylinder and a hemisphere at both ends. The volume of the capsule can be calculated by adding the volume of the cylinder and the volume of the hemisphere at both ends. This tutorial will discuss how to calculate the volume of a given capsule in Java using different methods. Capsule volume formula The formula for capsule volume is as follows: Capsule volume = Cylindrical volume Volume Two hemisphere volume in, r: The radius of the hemisphere. h: The height of the cylinder (excluding the hemisphere). Example 1 enter Radius = 5 units Height = 10 units Output Volume = 1570.8 cubic units explain Calculate volume using formula: Volume = π × r2 × h (4

PHP and Python each have their own advantages, and the choice should be based on project requirements. 1.PHP is suitable for web development, with simple syntax and high execution efficiency. 2. Python is suitable for data science and machine learning, with concise syntax and rich libraries.

PHP is a scripting language widely used on the server side, especially suitable for web development. 1.PHP can embed HTML, process HTTP requests and responses, and supports a variety of databases. 2.PHP is used to generate dynamic web content, process form data, access databases, etc., with strong community support and open source resources. 3. PHP is an interpreted language, and the execution process includes lexical analysis, grammatical analysis, compilation and execution. 4.PHP can be combined with MySQL for advanced applications such as user registration systems. 5. When debugging PHP, you can use functions such as error_reporting() and var_dump(). 6. Optimize PHP code to use caching mechanisms, optimize database queries and use built-in functions. 7
