用 Java 将 Word 转换为 HTML,主要是为了实现 Word 文档的在线浏览。不过转换效果并不十分理想(详见文末的使用结果)。下面开始开发。
第一步:引入 Maven 依赖
<!-- Word转HTML start -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-scratchpad</artifactId>
<version>3.14</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>3.14</version>
</dependency>
<dependency>
<groupId>fr.opensagres.xdocreport</groupId>
<artifactId>xdocreport</artifactId>
<version>1.0.6</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml-schemas</artifactId>
<version>3.14</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>ooxml-schemas</artifactId>
<version>1.3</version>
</dependency>
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.11.3</version>
</dependency>
<!-- Word转HTML end -->
第二步:java实现代码
原始版本:
package com.mmxpw.mmw.file.view.word;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.converter.PicturesManager;
import org.apache.poi.hwpf.converter.WordToHtmlConverter;
import org.apache.poi.hwpf.usermodel.PictureType;
import org.apache.poi.xwpf.converter.core.BasicURIResolver;
import org.apache.poi.xwpf.converter.core.FileImageExtractor;
import org.apache.poi.xwpf.converter.xhtml.XHTMLConverter;
import org.apache.poi.xwpf.converter.xhtml.XHTMLOptions;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.w3c.dom.Document;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import java.io.*;
public class WordToHtml {
/**
* 将word2003转换为html文件
*
* @param wordPath word文件路径
* @param wordName word文件名称无后缀
* @param suffix word文件后缀
* @param htmlPath html存储地址
* @throws IOException
* @throws TransformerException
* @throws ParserConfigurationException
*/
public static String Word2003ToHtml(String wordPath, String wordName, String suffix, String htmlPath)
throws IOException, TransformerException, ParserConfigurationException {
String htmlName = wordName + ".html";
final String imagePath = htmlPath + "image" + File.separator;
// 判断html文件是否存在
File htmlFile = new File(htmlPath + htmlName);
if (htmlFile.exists()) {
return htmlFile.getAbsolutePath();
}
// 原word文档
final String file = wordPath + File.separator + wordName + suffix;
InputStream input = new FileInputStream(new File(file));
HWPFDocument wordDocument = new HWPFDocument(input);
WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(
DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument());
// 设置图片存放的位置
wordToHtmlConverter.setPicturesManager(new PicturesManager() {
public String savePicture(byte[] content, PictureType pictureType, String suggestedName, float widthInches,
float heightInches) {
File imgPath = new File(imagePath);
if (!imgPath.exists()) {// 图片目录不存在则创建
imgPath.mkdirs();
}
File file = new File(imagePath + suggestedName);
try {
OutputStream os = new FileOutputStream(file);
os.write(content);
os.close();
} catch (FileNotFoundException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}
// 图片在html文件上的路径 相对路径
return "image/" + suggestedName;
}
});
// 解析word文档
wordToHtmlConverter.processDocument(wordDocument);
Document htmlDocument = wordToHtmlConverter.getDocument();
// 生成html文件上级文件夹
File folder = new File(htmlPath);
if (!folder.exists()) {
folder.mkdirs();
}
OutputStream outStream = new FileOutputStream(htmlFile);
DOMSource domSource = new DOMSource(htmlDocument);
StreamResult streamResult = new StreamResult(outStream);
TransformerFactory factory = TransformerFactory.newInstance();
Transformer serializer = factory.newTransformer();
serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8");
serializer.setOutputProperty(OutputKeys.INDENT, "yes");
serializer.setOutputProperty(OutputKeys.METHOD, "html");
serializer.transform(domSource, streamResult);
return htmlFile.getAbsolutePath();
}
/**
*
* 2007版本word转换成html
*
* @param wordPath word文件路径
* @param wordName word文件名称无后缀
* @param suffix word文件后缀
* @param htmlPath html存储地址
* @return
* @throws IOException
*/
public static String Word2007ToHtml(String wordPath, String wordName, String suffix, String htmlPath)
throws IOException {
String htmlName = wordName + ".html";
String imagePath = htmlPath + "image" + File.separator;
// 判断html文件是否存在
File htmlFile = new File(htmlPath + htmlName);
if (htmlFile.exists()) {
return htmlFile.getAbsolutePath();
}
// word文件
File wordFile = new File(wordPath + File.separator + wordName + suffix);
// 1) 加载word文档生成 XWPFDocument对象
InputStream in = new FileInputStream(wordFile);
XWPFDocument document = new XWPFDocument(in);
// 2) 解析 XHTML配置 (这里设置IURIResolver来设置图片存放的目录)
File imgFolder = new File(imagePath);
XHTMLOptions options = XHTMLOptions.create();
options.setExtractor(new FileImageExtractor(imgFolder));
// html中图片的路径 相对路径
options.URIResolver(new BasicURIResolver("image"));
options.setIgnoreStylesIfUnused(false);
options.setFragment(true);
// 3) 将 XWPFDocument转换成XHTML
// 生成html文件上级文件夹
File folder = new File(htmlPath);
if (!folder.exists()) {
folder.mkdirs();
}
OutputStream out = new FileOutputStream(htmlFile);
XHTMLConverter.getInstance().convert(document, out, options);
return htmlFile.getAbsolutePath();
}
public static void main(String[] args) {
try {
//Word2003ToHtml("E:\\templates", "2019-11-26", ".doc", "E://templates/");
Word2007ToHtml("E:\\templates", "OnLineWord", ".doc", "E://templates/");
} catch (Exception e) {
e.printStackTrace();
}
}
}
本人业务定制版本:
package com.mmxpw.mmw.file.view.word;
import org.apache.commons.fileupload.FileItem;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.converter.PicturesManager;
import org.apache.poi.hwpf.converter.WordToHtmlConverter;
import org.apache.poi.hwpf.usermodel.PictureType;
import org.apache.poi.xwpf.converter.core.BasicURIResolver;
import org.apache.poi.xwpf.converter.core.FileImageExtractor;
import org.apache.poi.xwpf.converter.xhtml.XHTMLConverter;
import org.apache.poi.xwpf.converter.xhtml.XHTMLOptions;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.w3c.dom.Document;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import java.io.*;
import java.util.ArrayList;
import java.util.List;
/**
* @Class WordToHtmlNew
* @Version 1.0
* @Date 创建时间:2019-12-02 09:29
* @Direction 类说明 传入文件,自动识别2003版本或者07版本,然后解析成HTML,且自动解析里面的图片,然后对图片做本地化存储,然后返回HTML
*/
public class WordToHtmlNew {
public final static List<String> FILE_TYPE = new ArrayList<>() ;
static{
FILE_TYPE.add( "doc" ) ;
FILE_TYPE.add( "docx" ) ;
}
/***
* 文件格式处理
* @param fileItem
* @return
* @throws Exception
*/
public static boolean fileTypeValidate( File fileItem ) throws Exception{
String[] namePix = fileItem.getName().split("\\u002E");
String suffix = namePix[namePix.length-1] ;
if( !FILE_TYPE.contains( suffix ) ){
throw new Exception( "您导入的文件格式错误,请导入word类型的文件." ) ;
}else{
return true ;
}
}
/****
* 判断文档的类型 本方法判断的是是否是新版
* @param fileItem
* @return
*/
public static boolean isDocx( File fileItem ){
String[] namePix = fileItem.getName().split("\\u002E");
String suffix = namePix[namePix.length-1] ;
if( suffix.toLowerCase().equals( "docx" )){
return true ;
}else{
return false ;
}
}
/****
* 自动识别版本
* 自动生成word内部的图片
* 自动过滤其他的类型文件
* 自动添加不通模块的图片前缀
* @param file
* @param iamgePrefix 在file的同级创建文件夹-image 提取word内部图片前缀为传入值:prefix
* @param accessPath html访问路径的url为:accessPath
* @return
*/
public static String WordAutoToHtml( File file , String iamgePrefix , String accessPath ) throws Exception {
if( file == null ){
throw new Exception( "您传入的文件为空." ) ;
}else{
if( !file.exists() ){
throw new Exception( "您传入的文件不存在." ) ;
}else{
if ( fileTypeValidate( file ) ){
//1.1 开始提取文件名称,文件路径
String path = file.getParent() ;
String fileName = file.getName() ;
//切割文件名里面的信息
String prefix = fileName.substring( 0 , fileName.lastIndexOf(".") );
String suffix = fileName.substring( fileName.lastIndexOf(".") + 1 );
/*String[] namePix = fileName.split("\\u002E");
String suffix = namePix[namePix.length-1] ;*/
//1.2 开始做文件识别,然后指向去2007版本的docx 还是去2003版本的doc
path = path + File.separator ;
if( isDocx( file ) ){
return Word2007ToHtml( file , path , prefix , path , iamgePrefix , accessPath ) ;
}else{
return Word2003ToHtml( file , path , prefix , path , iamgePrefix , accessPath ) ;
}
}
}
}
return null ;
}
/**
* 将word2003转换为html文件
*
* @param wordFile word文件
* @param wordPath word文件路径
* @param wordName word文件名称无后缀
* @param htmlPath html存储地址
* @param prefix 图片存储前缀
* @param accessPath 图片的相对路径访问地址
* @throws IOException
* @throws TransformerException
* @throws ParserConfigurationException
*/
public static String Word2003ToHtml(File wordFile , String wordPath, String wordName, String htmlPath
,final String prefix ,final String accessPath)
throws IOException, TransformerException, ParserConfigurationException {
String htmlName = wordName + ".html";
//1.1 判断html文件是否存在
File htmlFile = new File(htmlPath + htmlName);
if (htmlFile.exists()) {
return htmlFile.getAbsolutePath();
}
//1.2 原word文档 - 文件路径信息
//final String file = wordPath + File.separator + wordName + "." + suffix;
//InputStream input = new FileInputStream( new File( wordFile ) );
InputStream input = new FileInputStream( wordFile );
//1.3 final String imagePath
final String imagePath = wordPath + "image" ;
HWPFDocument wordDocument = new HWPFDocument(input);
WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(
DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument());
// 设置图片存放的位置
wordToHtmlConverter.setPicturesManager(new PicturesManager() {
public String savePicture(byte[] content, PictureType pictureType, String suggestedName, float widthInches,
float heightInches) {
File imgPath = new File(imagePath);
if (!imgPath.exists()) {// 图片目录不存在则创建
imgPath.mkdirs();
}
String imageFileName = prefix + suggestedName ;
File file = new File(imagePath + File.separator + imageFileName );
try {
OutputStream os = new FileOutputStream(file);
os.write(content);
os.close();
} catch (FileNotFoundException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}
// 图片在html文件上的路径 相对路径
return accessPath + imageFileName;
}
});
// 解析word文档
wordToHtmlConverter.processDocument(wordDocument);
Document htmlDocument = wordToHtmlConverter.getDocument();
// 生成html文件上级文件夹
File folder = new File(htmlPath);
if (!folder.exists()) {
folder.mkdirs();
}
OutputStream outStream = new FileOutputStream(htmlFile);
DOMSource domSource = new DOMSource(htmlDocument);
StreamResult streamResult = new StreamResult(outStream);
TransformerFactory factory = TransformerFactory.newInstance();
Transformer serializer = factory.newTransformer();
serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8");
serializer.setOutputProperty(OutputKeys.INDENT, "yes");
serializer.setOutputProperty(OutputKeys.METHOD, "html");
serializer.transform(domSource, streamResult);
return htmlFile.getAbsolutePath();
}
/**
* 2007版本word转换成html
*
* @param wordPath word文件路径
* @param wordName word文件名称无后缀
* @param suffix word文件后缀
* @param htmlPath html存储地址
* @param prefix 图片存储前缀
* @param accessPath 图片的相对路径访问地址
* @return
* @throws IOException
*/
public static String Word2007ToHtml( File wordFile , String wordPath, String wordName, String htmlPath
,final String prefix ,final String accessPath)
throws IOException, TransformerException, ParserConfigurationException {
String htmlName = wordName + ".html";
//1.1) 拼接HTML文件地址、判断html文件是否存在
File htmlFile = new File(htmlPath + htmlName);
if (htmlFile.exists()) {
return htmlFile.getAbsolutePath();
}
// word文件
//File wordFile = new File(wordPath + File.separator + wordName + suffix);
//1.2) 加载word文档生成 XWPFDocument对象
InputStream in = new FileInputStream( wordFile );
XWPFDocument document = new XWPFDocument( in );
//1.3) 解析 XHTML配置 (这里设置IURIResolver来设置图片存放的目录)
final String imagePath = wordPath + "image" ;
File imgFolder = new File( imagePath );
XHTMLOptions options = XHTMLOptions.create();
options.setExtractor(new FileImageExtractor(imgFolder));
// html中图片的路径 相对路径
options.URIResolver( new BasicURIResolver( accessPath ) );
options.setIgnoreStylesIfUnused(false);
options.setFragment(true);
//1.4) 将 XWPFDocument转换成XHTML
// 生成html文件上级文件夹
File folder = new File(htmlPath);
if (!folder.exists()) {
folder.mkdirs();
}
OutputStream out = new FileOutputStream(htmlFile);
XHTMLConverter.getInstance().convert(document, out, options);
return htmlFile.getAbsolutePath();
}
public static void main(String[] args) {
try {
//Word2003ToHtml("E:\\templates", "2019-11-26", ".doc", "E://templates/");
WordAutoToHtml( new File("E:\\templates\\OnLineWord.doc") , "image_" , "image/");
//WordAutoToHtml( new File("E:\\templates\\OnLineWord.docx") , "image_" , "image/");
} catch (Exception e) {
e.printStackTrace();
}
}
}
第三步:使用结果
2007版本:Word文档样式
实际转换结果:图片居中、字体、自动换行等效果都不理想。
2003版本的Word文档也有上述问题,大家使用此技术时请注意此问题。
参考来源:Java实现word转HTML
版权声明:本文内容由互联网用户自发贡献,该文观点仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 举报,一经查实,本站将立刻删除。
文章由极客之音整理,本文链接:https://www.bmabk.com/index.php/post/160887.html