Java实现word转HTML

用 Java 将 Word 转换为 HTML，主要是为了实现 Word 文档的在线浏览。不过转换结果的可用性不是很好。下面我们开始开发。

第一步：引入 Maven 依赖


		<!-- Word-to-HTML dependencies: start -->
		<!-- Apache POI scratchpad: presumably supplies the org.apache.poi.hwpf classes
		     (HWPFDocument, WordToHtmlConverter) used for .doc files; verify against the POI docs. -->
		<dependency>
			<groupId>org.apache.poi</groupId>
			<artifactId>poi-scratchpad</artifactId>
			<version>3.14</version>
		</dependency>

		<!-- Apache POI OOXML: presumably supplies the org.apache.poi.xwpf classes
		     (XWPFDocument) used for .docx files; verify against the POI docs. -->
		<dependency>
			<groupId>org.apache.poi</groupId>
			<artifactId>poi-ooxml</artifactId>
			<version>3.14</version>
		</dependency>

		<!-- XDocReport: presumably the source of the org.apache.poi.xwpf.converter.* classes
		     (XHTMLConverter, XHTMLOptions) imported by the Java code; confirm the artifact contents. -->
		<dependency>
			<groupId>fr.opensagres.xdocreport</groupId>
			<artifactId>xdocreport</artifactId>
			<version>1.0.6</version>
		</dependency>

		<!-- OOXML schema classes bundled for POI 3.14. -->
		<dependency>
			<groupId>org.apache.poi</groupId>
			<artifactId>poi-ooxml-schemas</artifactId>
			<version>3.14</version>
		</dependency>

		<!-- NOTE(review): this looks like it overlaps with poi-ooxml-schemas above;
		     confirm that both artifacts are really required on the classpath. -->
		<dependency>
			<groupId>org.apache.poi</groupId>
			<artifactId>ooxml-schemas</artifactId>
			<version>1.3</version>
		</dependency>

		<!-- jsoup is not referenced by the Java code shown in this article. -->
		<dependency>
			<groupId>org.jsoup</groupId>
			<artifactId>jsoup</artifactId>
			<version>1.11.3</version>
		</dependency>
		<!-- Word-to-HTML dependencies: end -->

第二步：Java 实现代码

原始版本：

package com.mmxpw.mmw.file.view.word;

import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.converter.PicturesManager;
import org.apache.poi.hwpf.converter.WordToHtmlConverter;
import org.apache.poi.hwpf.usermodel.PictureType;
import org.apache.poi.xwpf.converter.core.BasicURIResolver;
import org.apache.poi.xwpf.converter.core.FileImageExtractor;
import org.apache.poi.xwpf.converter.xhtml.XHTMLConverter;
import org.apache.poi.xwpf.converter.xhtml.XHTMLOptions;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.w3c.dom.Document;

import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import java.io.*;

public class WordToHtml {

    /**
     * 将word2003转换为html文件
     *
     * @param wordPath word文件路径
     * @param wordName word文件名称无后缀
     * @param suffix   word文件后缀
     * @param htmlPath html存储地址
     * @throws IOException
     * @throws TransformerException
     * @throws ParserConfigurationException
     */
    public static String Word2003ToHtml(String wordPath, String wordName, String suffix, String htmlPath)
            throws IOException, TransformerException, ParserConfigurationException {
        String htmlName = wordName + ".html";
        final String imagePath = htmlPath + "image" + File.separator;
        // 判断html文件是否存在
        File htmlFile = new File(htmlPath + htmlName);
        if (htmlFile.exists()) {
            return htmlFile.getAbsolutePath();
        }
        // 原word文档
        final String file = wordPath + File.separator + wordName + suffix;
        InputStream input = new FileInputStream(new File(file));
        HWPFDocument wordDocument = new HWPFDocument(input);
        WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(
                DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument());
        // 设置图片存放的位置
        wordToHtmlConverter.setPicturesManager(new PicturesManager() {
            public String savePicture(byte[] content, PictureType pictureType, String suggestedName, float widthInches,
                                      float heightInches) {
                File imgPath = new File(imagePath);
                if (!imgPath.exists()) {// 图片目录不存在则创建
                    imgPath.mkdirs();
                }
                File file = new File(imagePath + suggestedName);
                try {
                    OutputStream os = new FileOutputStream(file);
                    os.write(content);
                    os.close();
                } catch (FileNotFoundException e) {
                    e.printStackTrace();
                } catch (IOException e) {
                    e.printStackTrace();
                }
                // 图片在html文件上的路径 相对路径
                return "image/" + suggestedName;
            }
        });
        // 解析word文档
        wordToHtmlConverter.processDocument(wordDocument);
        Document htmlDocument = wordToHtmlConverter.getDocument();
        // 生成html文件上级文件夹
        File folder = new File(htmlPath);
        if (!folder.exists()) {
            folder.mkdirs();
        }
        OutputStream outStream = new FileOutputStream(htmlFile);
        DOMSource domSource = new DOMSource(htmlDocument);
        StreamResult streamResult = new StreamResult(outStream);
        TransformerFactory factory = TransformerFactory.newInstance();
        Transformer serializer = factory.newTransformer();
        serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8");
        serializer.setOutputProperty(OutputKeys.INDENT, "yes");
        serializer.setOutputProperty(OutputKeys.METHOD, "html");
        serializer.transform(domSource, streamResult);
        return htmlFile.getAbsolutePath();
    }
    /**
     *
     * 2007版本word转换成html
     *
     * @param wordPath  word文件路径
     * @param wordName word文件名称无后缀
     * @param suffix   word文件后缀
     * @param htmlPath html存储地址
     * @return
     * @throws IOException
     */
    public static String Word2007ToHtml(String wordPath, String wordName, String suffix, String htmlPath)
            throws IOException {
        String htmlName = wordName + ".html";
        String imagePath = htmlPath + "image" + File.separator;
        // 判断html文件是否存在
        File htmlFile = new File(htmlPath + htmlName);
        if (htmlFile.exists()) {
            return htmlFile.getAbsolutePath();
        }
        // word文件
        File wordFile = new File(wordPath + File.separator + wordName + suffix);
        // 1) 加载word文档生成 XWPFDocument对象
        InputStream in = new FileInputStream(wordFile);
        XWPFDocument document = new XWPFDocument(in);
        // 2) 解析 XHTML配置 (这里设置IURIResolver来设置图片存放的目录)
        File imgFolder = new File(imagePath);
        XHTMLOptions options = XHTMLOptions.create();
        options.setExtractor(new FileImageExtractor(imgFolder));
        // html中图片的路径 相对路径
        options.URIResolver(new BasicURIResolver("image"));
        options.setIgnoreStylesIfUnused(false);
        options.setFragment(true);
        // 3) 将 XWPFDocument转换成XHTML
        // 生成html文件上级文件夹
        File folder = new File(htmlPath);
        if (!folder.exists()) {
            folder.mkdirs();
        }
        OutputStream out = new FileOutputStream(htmlFile);
        XHTMLConverter.getInstance().convert(document, out, options);
        return htmlFile.getAbsolutePath();
    }

    public static void main(String[] args) {
        try {
            //Word2003ToHtml("E:\\templates", "2019-11-26", ".doc", "E://templates/");
            Word2007ToHtml("E:\\templates", "OnLineWord", ".doc", "E://templates/");
        } catch (Exception e) {
            e.printStackTrace();
        }
    }


}

本人业务定制版本：

package com.mmxpw.mmw.file.view.word;

import org.apache.commons.fileupload.FileItem;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.converter.PicturesManager;
import org.apache.poi.hwpf.converter.WordToHtmlConverter;
import org.apache.poi.hwpf.usermodel.PictureType;
import org.apache.poi.xwpf.converter.core.BasicURIResolver;
import org.apache.poi.xwpf.converter.core.FileImageExtractor;
import org.apache.poi.xwpf.converter.xhtml.XHTMLConverter;
import org.apache.poi.xwpf.converter.xhtml.XHTMLOptions;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.w3c.dom.Document;

import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import java.io.*;
import java.util.ArrayList;
import java.util.List;

/**
 * @Class WordToHtmlNew
 * @Version 1.0
 * @Date 创建时间：2019-12-02 09:29
 * @Direction 类说明   传入文件，自动识别2003版本或者07版本，然后解析成HTML，且自动解析里面的图片，然后对图片做本地化存储，然后返回HTML
 */
public class WordToHtmlNew {

    public final static List<String> FILE_TYPE = new ArrayList<>() ;

    static{
        FILE_TYPE.add( "doc" ) ;
        FILE_TYPE.add( "docx" ) ;
    }

    /***
     * 文件格式处理
     * @param fileItem
     * @return
     * @throws Exception
     */
    public static boolean fileTypeValidate( File fileItem ) throws Exception{
        String[] namePix = fileItem.getName().split("\\u002E");
        String suffix = namePix[namePix.length-1] ;
        if( !FILE_TYPE.contains( suffix )  ){
            throw new Exception( "您导入的文件格式错误，请导入word类型的文件." ) ;
        }else{
            return true ;
        }
    }

    /****
     * 判断文档的类型 本方法判断的是是否是新版
     * @param fileItem
     * @return
     */
    public static boolean isDocx( File fileItem ){
        String[] namePix = fileItem.getName().split("\\u002E");
        String suffix = namePix[namePix.length-1] ;
        if( suffix.toLowerCase().equals( "docx" )){
            return true ;
        }else{
            return false ;
        }
    }


    /****
     * 自动识别版本
     * 自动生成word内部的图片
     * 自动过滤其他的类型文件
     * 自动添加不通模块的图片前缀
     * @param file
     * @param iamgePrefix  在file的同级创建文件夹-image 提取word内部图片前缀为传入值：prefix
     * @param accessPath   html访问路径的url为：accessPath
     * @return
     */
    public static String WordAutoToHtml( File file , String iamgePrefix , String accessPath ) throws Exception {
        if( file == null ){
            throw new Exception( "您传入的文件为空." ) ;
        }else{
            if( !file.exists() ){
                throw new Exception( "您传入的文件不存在." ) ;
            }else{
                if ( fileTypeValidate( file ) ){
                    //1.1 开始提取文件名称，文件路径
                    String path = file.getParent() ;
                    String fileName = file.getName() ;
                    //切割文件名里面的信息
                    String prefix = fileName.substring( 0 , fileName.lastIndexOf(".")  );
                    String suffix = fileName.substring( fileName.lastIndexOf(".") + 1  );
                    /*String[] namePix = fileName.split("\\u002E");
                    String suffix = namePix[namePix.length-1] ;*/
                    //1.2 开始做文件识别，然后指向去2007版本的docx 还是去2003版本的doc
                    path = path + File.separator ;
                    if( isDocx( file ) ){
                        return Word2007ToHtml( file , path  , prefix  , path , iamgePrefix , accessPath  ) ;
                    }else{
                        return Word2003ToHtml( file , path , prefix , path , iamgePrefix , accessPath  ) ;
                    }
                }
            }
        }
        return null ;
    }



    /**
     * 将word2003转换为html文件
     *
     * @param wordFile word文件
     * @param wordPath word文件路径
     * @param wordName word文件名称无后缀
     * @param htmlPath html存储地址
     * @param prefix   图片存储前缀
     * @param accessPath  图片的相对路径访问地址
     * @throws IOException
     * @throws TransformerException
     * @throws ParserConfigurationException
     */
    public static String Word2003ToHtml(File wordFile , String wordPath, String wordName, String htmlPath
            ,final String prefix  ,final String accessPath)
            throws IOException, TransformerException, ParserConfigurationException {
        String htmlName = wordName + ".html";
        //1.1 判断html文件是否存在
        File htmlFile = new File(htmlPath  + htmlName);
        if (htmlFile.exists()) {
            return htmlFile.getAbsolutePath();
        }
        //1.2 原word文档 - 文件路径信息
        //final String file = wordPath + File.separator + wordName  + "." + suffix;
        //InputStream input = new FileInputStream( new File( wordFile ) );
        InputStream input = new FileInputStream( wordFile );
        //1.3 final String imagePath
        final String imagePath = wordPath + "image" ;

        HWPFDocument wordDocument = new HWPFDocument(input);
        WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(
                DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument());
        // 设置图片存放的位置
        wordToHtmlConverter.setPicturesManager(new PicturesManager() {
            public String savePicture(byte[] content, PictureType pictureType, String suggestedName, float widthInches,
                                      float heightInches) {
                File imgPath = new File(imagePath);
                if (!imgPath.exists()) {// 图片目录不存在则创建
                    imgPath.mkdirs();
                }
                String imageFileName = prefix  + suggestedName ;
                File file = new File(imagePath + File.separator + imageFileName );
                try {
                    OutputStream os = new FileOutputStream(file);
                    os.write(content);
                    os.close();
                } catch (FileNotFoundException e) {
                    e.printStackTrace();
                } catch (IOException e) {
                    e.printStackTrace();
                }
                // 图片在html文件上的路径 相对路径
                return accessPath + imageFileName;
            }
        });
        // 解析word文档
        wordToHtmlConverter.processDocument(wordDocument);
        Document htmlDocument = wordToHtmlConverter.getDocument();
        // 生成html文件上级文件夹
        File folder = new File(htmlPath);
        if (!folder.exists()) {
            folder.mkdirs();
        }
        OutputStream outStream = new FileOutputStream(htmlFile);
        DOMSource domSource = new DOMSource(htmlDocument);
        StreamResult streamResult = new StreamResult(outStream);
        TransformerFactory factory = TransformerFactory.newInstance();
        Transformer serializer = factory.newTransformer();
        serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8");
        serializer.setOutputProperty(OutputKeys.INDENT, "yes");
        serializer.setOutputProperty(OutputKeys.METHOD, "html");
        serializer.transform(domSource, streamResult);
        return htmlFile.getAbsolutePath();
    }

    /**
     * 2007版本word转换成html
     *
     * @param wordPath  word文件路径
     * @param wordName word文件名称无后缀
     * @param suffix   word文件后缀
     * @param htmlPath html存储地址
     * @param prefix   图片存储前缀
     * @param accessPath  图片的相对路径访问地址
     * @return
     * @throws IOException
     */
    public static String Word2007ToHtml( File wordFile , String wordPath, String wordName, String htmlPath
            ,final String prefix ,final String accessPath)
            throws IOException, TransformerException, ParserConfigurationException {
        String htmlName = wordName + ".html";
        //1.1) 拼接HTML文件地址、判断html文件是否存在
        File htmlFile = new File(htmlPath + htmlName);
        if (htmlFile.exists()) {
            return htmlFile.getAbsolutePath();
        }
        // word文件
        //File wordFile = new File(wordPath + File.separator + wordName + suffix);
        //1.2) 加载word文档生成 XWPFDocument对象
        InputStream in = new FileInputStream( wordFile );
        XWPFDocument document = new XWPFDocument( in );
        //1.3) 解析 XHTML配置 (这里设置IURIResolver来设置图片存放的目录)
        final String imagePath = wordPath + "image" ;
        File imgFolder = new File( imagePath );
        XHTMLOptions options = XHTMLOptions.create();
        options.setExtractor(new FileImageExtractor(imgFolder));
        // html中图片的路径 相对路径
        options.URIResolver( new BasicURIResolver( accessPath ) );
        options.setIgnoreStylesIfUnused(false);
        options.setFragment(true);
        //1.4) 将 XWPFDocument转换成XHTML
        // 生成html文件上级文件夹
        File folder = new File(htmlPath);
        if (!folder.exists()) {
            folder.mkdirs();
        }
        OutputStream out = new FileOutputStream(htmlFile);
        XHTMLConverter.getInstance().convert(document, out, options);
        return htmlFile.getAbsolutePath();
    }

    public static void main(String[] args) {
        try {
            //Word2003ToHtml("E:\\templates", "2019-11-26", ".doc", "E://templates/");
            WordAutoToHtml( new File("E:\\templates\\OnLineWord.doc") , "image_" , "image/");
            //WordAutoToHtml( new File("E:\\templates\\OnLineWord.docx") , "image_" , "image/");
        } catch (Exception e) {
            e.printStackTrace();
        }
    }


}

第三步：使用结果

2007版本：Word文档样式