Home Java javaTutorial Java method to convert Word/Pdf/TXT to HTML

Java method to convert Word/Pdf/TXT to HTML

Apr 26, 2023 am 10:37 AM
java html

一:Java实现将word转换为html

   1:引入依赖

 1 <dependency>
 2   <groupId>fr.opensagres.xdocreport</groupId>
 3   <artifactId>fr.opensagres.xdocreport.document</artifactId>
 4   <version>1.0.5</version>
 5 </dependency>
 6 <dependency> 
 7   <groupId>fr.opensagres.xdocreport</groupId> 
 8   <artifactId>org.apache.poi.xwpf.converter.xhtml</artifactId> 
 9   <version>1.0.5</version> 
10 </dependency>
11   <dependency>
12   <groupId>org.apache.poi</groupId>
13   <artifactId>poi</artifactId>
14   <version>3.12</version>
15 </dependency>
16 <dependency>
17   <groupId>org.apache.poi</groupId>
18   <artifactId>poi-scratchpad</artifactId>
19   <version>3.12</version>
20 </dependency>
Copy after login

  2:代码demo

  1 package com.svse.controller;
  2 
  3 import javax.xml.parsers.DocumentBuilderFactory;
  4 import javax.xml.parsers.ParserConfigurationException;
  5 import javax.xml.transform.OutputKeys;
  6 import javax.xml.transform.Transformer;
  7 import javax.xml.transform.TransformerException;
  8 import javax.xml.transform.TransformerFactory;
  9 import javax.xml.transform.dom.DOMSource;
 10 import javax.xml.transform.stream.StreamResult;
 11 
 12 import org.apache.poi.hwpf.HWPFDocument;
 13 import org.apache.poi.hwpf.converter.PicturesManager;
 14 import org.apache.poi.hwpf.converter.WordToHtmlConverter;
 15 import org.apache.poi.hwpf.usermodel.PictureType;
 16 import org.apache.poi.xwpf.converter.core.BasicURIResolver;
 17 import org.apache.poi.xwpf.converter.core.FileImageExtractor;
 18 import org.apache.poi.xwpf.converter.core.FileURIResolver;
 19 import org.apache.poi.xwpf.converter.core.IURIResolver;
 20 import org.apache.poi.xwpf.converter.core.IXWPFConverter;
 21 import org.apache.poi.xwpf.converter.xhtml.XHTMLConverter;
 22 import org.apache.poi.xwpf.converter.xhtml.XHTMLOptions;
 23 import org.apache.poi.xwpf.usermodel.XWPFDocument;
 24 /**
 25  * word 转换成html
 26  */
 27 public class TestWordToHtml {
 28 
 29     public static  final String STORAGEPATH="C://works//files//";
 30     public static  final String IP="192.168.30.222";
 31     public static  final String PORT="8010";
 32     public static void main(String[] args) throws IOException, TransformerException, ParserConfigurationException {
 33         TestWordToHtml wt=new TestWordToHtml();
 34         //wt.Word2003ToHtml("甲骨文考证.doc");
 35         wt.Word2007ToHtml("甲骨文考证.docx");
 36 
 37     }
 38       
 39      /**
 40      * 2003版本word转换成html
 41      * @throws IOException
 42      * @throws TransformerException
 43      * @throws ParserConfigurationException
 44      */
 45     public void Word2003ToHtml(String fileName) throws IOException, TransformerException, ParserConfigurationException {
 46        
 47         final String imagepath = STORAGEPATH+"fileImage/";//解析时候如果doc文件中有图片  图片会保存在此路径
 48         final String strRanString=getRandomNum();
 49         String filepath =STORAGEPATH;
 50         String htmlName =fileName.substring(0, fileName.indexOf("."))+ "2003.html";
 51         final String file = filepath + fileName;
 52         InputStream input = new FileInputStream(new File(file));
 53         HWPFDocument wordDocument = new HWPFDocument(input);
 54         WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument());
 55         //设置图片存放的位置
 56         wordToHtmlConverter.setPicturesManager(new PicturesManager() {
 57             public String savePicture(byte[] content, PictureType pictureType, String suggestedName, float widthInches, float heightInches) {
 58                 File imgPath = new File(imagepath);
 59                 if(!imgPath.exists()){//图片目录不存在则创建
 60                     imgPath.mkdirs();
 61                 }
 62                 
 63                 File file = new File(imagepath +strRanString+suggestedName);
 64                 try {
 65                     OutputStream os = new FileOutputStream(file);
 66                     os.write(content);
 67                     os.close();
 68                 } catch (FileNotFoundException e) {
 69                     e.printStackTrace();
 70                 } catch (IOException e) {
 71                     e.printStackTrace();
 72                 }
 73                 
 74                 return  "http://"+IP+":"+PORT+"//uploadFile/fileImage/"+strRanString+suggestedName;
 75                // return imagepath +strRanString+suggestedName;
 76             }
 77         });
 78         
 79         //解析word文档
 80         wordToHtmlConverter.processDocument(wordDocument);
 81         Document htmlDocument = wordToHtmlConverter.getDocument();
 82         
 83         File htmlFile = new File(filepath +strRanString+htmlName);
 84         OutputStream outStream = new FileOutputStream(htmlFile);
 85         
 86 
 87         DOMSource domSource = new DOMSource(htmlDocument);
 88         StreamResult streamResult = new StreamResult(outStream);
 89 
 90         TransformerFactory factory = TransformerFactory.newInstance();
 91         Transformer serializer = factory.newTransformer();
 92         serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8");
 93         serializer.setOutputProperty(OutputKeys.INDENT, "yes");
 94         serializer.setOutputProperty(OutputKeys.METHOD, "html");
 95         
 96         serializer.transform(domSource, streamResult);
 97         outStream.close();
 98         
 99         System.out.println("生成html文件路径:"+ "http://"+IP+":"+PORT+"//uploadFile/"+strRanString+htmlName);
100     }
101 
102     /**
103      * 2007版本word转换成html
104      * @throws IOException
105      */
106     public void Word2007ToHtml(String fileName) throws IOException {
107         
108        final String strRanString=getRandomNum();
109         
110         String filepath = STORAGEPATH+strRanString;
111         String htmlName =fileName.substring(0, fileName.indexOf("."))+ "2007.html";
112         File f = new File(STORAGEPATH+fileName);  
113         if (!f.exists()) {  
114             System.out.println("Sorry File does not Exists!");  
115         } else {  
116             if (f.getName().endsWith(".docx") || f.getName().endsWith(".DOCX")) {  
117                 try {
118                     // 1) 加载word文档生成 XWPFDocument对象  
119                     InputStream in = new FileInputStream(f);  
120                     XWPFDocument document = new XWPFDocument(in);  
121       
122                     // 2) 解析 XHTML配置 (这里设置IURIResolver来设置图片存放的目录)  
123                     File imageFolderFile = new File(filepath);  
124                     XHTMLOptions options = XHTMLOptions.create().URIResolver(new FileURIResolver(imageFolderFile));  
125                     options.setExtractor(new FileImageExtractor(imageFolderFile));  
126                     options.URIResolver(new IURIResolver() {
127                         public String resolve(String uri) {
128                             //http://192.168.30.222:8010//uploadFile/....
129                             return "http://"+IP+":"+PORT+"//uploadFile/"+strRanString +"/"+ uri;
130                         }
131                     });
132                     
133                     options.setIgnoreStylesIfUnused(false);  
134                     options.setFragment(true);  
135                       
136                     // 3) 将 XWPFDocument转换成XHTML  
137                     OutputStream out = new FileOutputStream(new File(filepath + htmlName));  
138                     IXWPFConverter<XHTMLOptions> converter = XHTMLConverter.getInstance();
139                     converter.convert(document,out, options);
140                     //XHTMLConverter.getInstance().convert(document, out, options);  
141                     System.out.println("html路径:"+"http://"+IP+":"+PORT+"//uploadFile/"+strRanString+htmlName);
142                 } catch (Exception e) {
143                     e.printStackTrace();
144                 }
145             
146             } else {  
147                 System.out.println("Enter only MS Office 2007+ files");  
148             }  
149         }  
150     }  
151 
152      /**
153      *功能说明:生成时间戳
154      *创建人:zsq
155      *创建时间:2019年12月7日 下午2:37:09
156      *
157      */
158      public static String getRandomNum(){
159          Date dt = new Date();
160          SimpleDateFormat sdf = new SimpleDateFormat("yyyyMMddHHmmss");  
161          String str=sdf.format(dt);
162          return str;
163      }
164      
165    }
Copy after login

二:Java实现将Pdf转换为html

  1: 引入依赖

 1 <dependency>
 2             <groupId>net.sf.cssbox</groupId>
 3             <artifactId>pdf2dom</artifactId>
 4             <version>1.7</version>
 5         </dependency> 
 6         <dependency>
 7             <groupId>org.apache.pdfbox</groupId>
 8             <artifactId>pdfbox</artifactId>
 9             <version>2.0.12</version>
10         </dependency>
11         <dependency>
12             <groupId>org.apache.pdfbox</groupId>
13             <artifactId>pdfbox-tools</artifactId>
14             <version>2.0.12</version>
15  </dependency>
16
Copy after login

2:代码Demo

 1 public class PdfToHtml {
 2 
 3   /*
 4     pdf转换html
 5      */
 6     public void pdfToHtmlTest(String inPdfPath,String outputHtmlPath)  {
 7        // String outputPath = "C:\\works\\files\\ZSQ保密知识测试题库.html";
 8     9        //try() 写在()里面会自动关闭流
10         try{
11             BufferedWriter out = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(new File(outputHtmlPath)),"utf-8"));
12             //加载PDF文档
13             //PDDocument document = PDDocument.load(bytes);
14             PDDocument document = PDDocument.load(new File(inPdfPath));
15             PDFDomTree pdfDomTree = new PDFDomTree();
16             pdfDomTree.writeText(document,out);
17         } catch (Exception e) {
18             e.printStackTrace();
19         }
20     }
21 
22     public static void main(String[] args) throws IOException {
23         PdfToHtml ph=new PdfToHtml();
24         String pdfPath="C:\\works\\files\\武研中心行政考勤制度.pdf";
25         String outputPath="C:\\works\\files\\武研中心行政考勤制度.html";
26         ph.pdfToHtmlTest(pdfPath,outputPath);
27   }
28 
29 }
Copy after login

三:Java实现将TXT转换为html

 1  /*
 2      * txt文档转html
 3        filePath:txt原文件路径
 4        htmlPosition:转化后生成的html路径
 5     */
 6     public static void txtToHtml(String filePath, String htmlPosition) {
 7         try {
 8             //String encoding = "GBK";
 9             File file = new File(filePath);
10             if (file.isFile() && file.exists()) { // 判断文件是否存在
11                 InputStreamReader read = new InputStreamReader(new FileInputStream(file), "GBK");
12                 // 考虑到编码格式
13                 BufferedReader bufferedReader = new BufferedReader(read);
14                 // 写文件
15                 FileOutputStream fos = new FileOutputStream(new File(htmlPosition));
16                 OutputStreamWriter osw = new OutputStreamWriter(fos, "GBK");
17                 BufferedWriter bw = new BufferedWriter(osw);
18                 String lineTxt = null;
19                 while ((lineTxt = bufferedReader.readLine()) != null) {
20                     bw.write("&nbsp&nbsp&nbsp"+lineTxt + "</br>");
21                 }
22                 bw.close();
23                 osw.close();
24                 fos.close();
25                 read.close();
26             } else {
27                 System.out.println("找不到指定的文件");
28             }
29         } catch (Exception e) {
30             System.out.println("读取文件内容出错");
31             e.printStackTrace();
32         }
33     }
Copy after login

The above is the detailed content of Java method to convert Word/Pdf/TXT to HTML. For more information, please follow other related articles on the PHP Chinese website!

Statement of this Website
The content of this article is voluntarily contributed by netizens, and the copyright belongs to the original author. This site does not assume corresponding legal responsibility. If you find any content suspected of plagiarism or infringement, please contact admin@php.cn

Hot AI Tools

Undresser.AI Undress

Undresser.AI Undress

AI-powered app for creating realistic nude photos

AI Clothes Remover

AI Clothes Remover

Online AI tool for removing clothes from photos.

Undress AI Tool

Undress AI Tool

Undress images for free

Clothoff.io

Clothoff.io

AI clothes remover

Video Face Swap

Video Face Swap

Swap faces in any video effortlessly with our completely free AI face swap tool!

Hot Tools

Notepad++7.3.1

Notepad++7.3.1

Easy-to-use and free code editor

SublimeText3 Chinese version

SublimeText3 Chinese version

Chinese version, very easy to use

Zend Studio 13.0.1

Zend Studio 13.0.1

Powerful PHP integrated development environment

Dreamweaver CS6

Dreamweaver CS6

Visual web development tools

SublimeText3 Mac version

SublimeText3 Mac version

God-level code editing software (SublimeText3)

How do you parse and process HTML/XML in PHP? How do you parse and process HTML/XML in PHP? Feb 07, 2025 am 11:57 AM

This tutorial demonstrates how to efficiently process XML documents using PHP. XML (eXtensible Markup Language) is a versatile text-based markup language designed for both human readability and machine parsing. It's commonly used for data storage an

Break or return from Java 8 stream forEach? Break or return from Java 8 stream forEach? Feb 07, 2025 pm 12:09 PM

Java 8 introduces the Stream API, providing a powerful and expressive way to process data collections. However, a common question when using Stream is: How to break or return from a forEach operation? Traditional loops allow for early interruption or return, but Stream's forEach method does not directly support this method. This article will explain the reasons and explore alternative methods for implementing premature termination in Stream processing systems. Further reading: Java Stream API improvements Understand Stream forEach The forEach method is a terminal operation that performs one operation on each element in the Stream. Its design intention is

The Roles of HTML, CSS, and JavaScript: Core Responsibilities The Roles of HTML, CSS, and JavaScript: Core Responsibilities Apr 08, 2025 pm 07:05 PM

HTML defines the web structure, CSS is responsible for style and layout, and JavaScript gives dynamic interaction. The three perform their duties in web development and jointly build a colorful website.

React's Role in HTML: Enhancing User Experience React's Role in HTML: Enhancing User Experience Apr 09, 2025 am 12:11 AM

React combines JSX and HTML to improve user experience. 1) JSX embeds HTML to make development more intuitive. 2) The virtual DOM mechanism optimizes performance and reduces DOM operations. 3) Component-based management UI to improve maintainability. 4) State management and event processing enhance interactivity.

Understanding HTML, CSS, and JavaScript: A Beginner's Guide Understanding HTML, CSS, and JavaScript: A Beginner's Guide Apr 12, 2025 am 12:02 AM

WebdevelopmentreliesonHTML,CSS,andJavaScript:1)HTMLstructurescontent,2)CSSstylesit,and3)JavaScriptaddsinteractivity,formingthebasisofmodernwebexperiences.

Java Program to Find the Volume of Capsule Java Program to Find the Volume of Capsule Feb 07, 2025 am 11:37 AM

Capsules are three-dimensional geometric figures, composed of a cylinder and a hemisphere at both ends. The volume of the capsule can be calculated by adding the volume of the cylinder and the volume of the hemisphere at both ends. This tutorial will discuss how to calculate the volume of a given capsule in Java using different methods. Capsule volume formula The formula for capsule volume is as follows: Capsule volume = Cylindrical volume Volume Two hemisphere volume in, r: The radius of the hemisphere. h: The height of the cylinder (excluding the hemisphere). Example 1 enter Radius = 5 units Height = 10 units Output Volume = 1570.8 cubic units explain Calculate volume using formula: Volume = π × r2 × h (4

PHP vs. Python: Understanding the Differences PHP vs. Python: Understanding the Differences Apr 11, 2025 am 12:15 AM

PHP and Python each have their own advantages, and the choice should be based on project requirements. 1.PHP is suitable for web development, with simple syntax and high execution efficiency. 2. Python is suitable for data science and machine learning, with concise syntax and rich libraries.

PHP: A Key Language for Web Development PHP: A Key Language for Web Development Apr 13, 2025 am 12:08 AM

PHP is a scripting language widely used on the server side, especially suitable for web development. 1.PHP can embed HTML, process HTTP requests and responses, and supports a variety of databases. 2.PHP is used to generate dynamic web content, process form data, access databases, etc., with strong community support and open source resources. 3. PHP is an interpreted language, and the execution process includes lexical analysis, grammatical analysis, compilation and execution. 4.PHP can be combined with MySQL for advanced applications such as user registration systems. 5. When debugging PHP, you can use functions such as error_reporting() and var_dump(). 6. Optimize PHP code to use caching mechanisms, optimize database queries and use built-in functions. 7

See all articles