您现在的位置:首页 >> 前端 >> 内容

把docx里面的文字,公式和图片转成html(方法教程)

时间:2018/2/6 14:30:46 点击:

  核心提示:把docx里面的文字,公式和图片转成html,大体需要如下的方式:先把docx的文字转成poi的XWPFDocument(java代码:InputStream is=new FileInputStream("d:\\1.docx"); XWPFDocument docx = new XWPFDocument(is);),再逐个处理段落、表格、图片和公式……

把docx里面的文字,公式和图片转成html大体需要如下的方式

把docx的文字转成poi的XWPFDocument

java

InputStream is=new FileInputStream("d:\\1.docx");

XWPFDocument docx = new XWPFDocument(is);

得到内容的列表,包括XWPFParagraph和XWPFTable,通过BodyElementType区分

// Walk the body elements of the document and dispatch on their type.
List<IBodyElement> eles = docx.getBodyElements();
for (IBodyElement e : eles) {
    if (e.getElementType() == BodyElementType.PARAGRAPH) {
        // Pass the already-cast paragraph; this also works if the handler
        // is declared to accept the more general IBodyElement.
        XWPFParagraph p = (XWPFParagraph) e;
        handleParagraph(p);
    } else if (e.getElementType() == BodyElementType.TABLE) {
        handleTable((XWPFTable) e);
    }
}

得到XWPFParagraph 后,通过如下两个方法得到XWPFParagraph里面的具体内容

List runs = p.getRuns();//文本和图片
List pics = run.getEmbeddedPictures();//得到所有图片
再把图片保存起来就可以了。
List<CTOMath> oMathList = p.getCTP().getOMathList(); // formulas of the paragraph
//公式这个就复杂了CTOMath属于XmlObject形式的xml文件,属于OMML,要先转成MathML,再把MathML转成png,保存到硬盘上。
//把XmlObject转成MathML
/**
 * Converts an OMML fragment into a MathML string that can be embedded in HTML.
 *
 * <p>The DOM node behind {@code xmlObject} is transformed with the
 * {@code OMML2MML.XSL} stylesheet loaded from the classpath. The XML
 * declaration is omitted, and the OMML namespace declaration and the
 * {@code mml:} prefix are then stripped from the result.
 *
 * @param xmlObject the OMML element to convert
 * @return the MathML markup with namespace prefixes removed
 * @throws IllegalStateException if the stylesheet is not found on the classpath
 * @throws Exception if the stylesheet cannot be compiled or the transformation fails
 */
private static String getMathML(XmlObject xmlObject) throws Exception {
        final String xslFile = "/cn/com/eduedu/jee/util/OMML2MML.XSL";

        // Fail with a clear message when the stylesheet is missing, and close
        // the resource stream once the stylesheet has been compiled.
        final Transformer transformer;
        try (InputStream xsl = MSDocxUtils.class.getResourceAsStream(xslFile)) {
            if (xsl == null) {
                throw new IllegalStateException("Stylesheet not found on classpath: " + xslFile);
            }
            transformer = TransformerFactory.newInstance().newTransformer(new StreamSource(xsl));
        }
        transformer.setOutputProperty("omit-xml-declaration", "yes");

        Node node = xmlObject.getDomNode();
        StringWriter stringwriter = new StringWriter();
        transformer.transform(new DOMSource(node), new StreamResult(stringwriter));
        String mathML = stringwriter.toString();

        // The stock OMML2MML.XSL emits MathML as XML with namespace prefixes.
        // The MathML is used inside HTML here, so the prefixes are not wanted.
        // Ideally the stylesheet itself would be changed; to keep this example
        // simple the XML specifics are removed with plain text replacement.
        // The OMML namespace URI uses the http scheme; literal replace() is
        // used so that the dots in the URI are not treated as regex wildcards.
        mathML = mathML.replace("xmlns:m=\"http://schemas.openxmlformats.org/officeDocument/2006/math\"", "");
        mathML = mathML.replace("xmlns:mml", "xmlns");
        mathML = mathML.replace("mml:", "");
        return mathML;
    }
//MathML转成Document 
/**
 * Parses an XML string into a DOM document.
 *
 * @param xmlStr the XML text to parse
 * @return the parsed document, or {@code null} when parsing fails (the
 *         exception's stack trace is printed in that case)
 */
private static Document convertStringToDocument(String xmlStr) {
        final DocumentBuilderFactory builderFactory = DocumentBuilderFactory.newInstance();
        Document parsed = null;
        try {
            final DocumentBuilder domBuilder = builderFactory.newDocumentBuilder();
            final InputSource xmlInput = new InputSource(new StringReader(xmlStr));
            parsed = domBuilder.parse(xmlInput);
        } catch (Exception e) {
            // Best effort: report the failure and fall through to return null.
            e.printStackTrace();
        }
        return parsed;
    }
//最后的代码是这样的
/**
 * Renders one OMML formula to a PNG image and returns HTML that references it.
 *
 * <p>The formula is converted to MathML, parsed into a DOM document, rendered
 * by JEuclid at math size 18 into an in-memory PNG, and the bytes are handed
 * to {@code imageParser} under the name {@code png_<counter>.png}.
 *
 * @param xmlObject   the OMML element to render
 * @param imageParser receives the PNG bytes and the proposed file name
 * @return an {@code <img>} tag whose {@code src} is the value returned by
 *         {@code imageParser.parse}
 * @throws IllegalStateException if the generated MathML cannot be parsed
 * @throws Exception if the MathML conversion or the rendering fails
 */
private static String convertOmathToPng(XmlObject xmlObject,MSDocxToHtmlImageParser imageParser) throws Exception {
        pngNumber++;
        Document document = convertStringToDocument(getMathML(xmlObject));
        if (document == null) {
            // convertStringToDocument reports parse failures by returning null.
            throw new IllegalStateException("MathML generated for formula " + pngNumber + " could not be parsed");
        }

        Converter mathMLConvert = Converter.getInstance();
        LayoutContextImpl localLayoutContextImpl = new LayoutContextImpl(LayoutContextImpl.getDefaultLayoutContext());
        localLayoutContextImpl.setParameter(Parameter.MATHSIZE, 18);

        String pngName;
        try (ByteArrayOutputStream os = new ByteArrayOutputStream()) {
            mathMLConvert.convert(document, os, "image/png", localLayoutContextImpl);
            pngName = imageParser.parse(os.toByteArray(), "png_" + pngNumber + ".png");
        }

        // NOTE(review): the published listing returned the article title here and
        // never used pngName; the original <img> markup appears to have been lost
        // when the page was rendered. Assumes imageParser.parse returns the path
        // or URL under which the image was stored - confirm against its implementation.
        return "<img src=\"" + pngName + "\"/>";
    }

用到的所有包

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringReader;
import java.io.StringWriter;
import java.math.BigInteger;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import javax.xml.transform.stream.StreamSource;

import org.apache.poi.POIXMLProperties;
import org.apache.poi.xwpf.usermodel.BodyElementType;
import org.apache.poi.xwpf.usermodel.IBodyElement;
import org.apache.poi.xwpf.usermodel.UnderlinePatterns;
import org.apache.poi.xwpf.usermodel.VerticalAlign;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
import org.apache.poi.xwpf.usermodel.XWPFPicture;
import org.apache.poi.xwpf.usermodel.XWPFRun;
import org.apache.poi.xwpf.usermodel.XWPFStyles;
import org.apache.poi.xwpf.usermodel.XWPFTable;
import org.apache.poi.xwpf.usermodel.XWPFTableCell;
import org.apache.poi.xwpf.usermodel.XWPFTableRow;
import org.apache.xmlbeans.XmlObject;
import org.openxmlformats.schemas.officeDocument.x2006.math.CTOMath;
import org.openxmlformats.schemas.officeDocument.x2006.math.CTOMathPara;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;

import cn.com.eduedu.jee.util.wordnumber.IWordNumber;
import cn.com.eduedu.jee.util.wordnumber.WordNumberFactory;
import net.sourceforge.jeuclid.context.LayoutContextImpl;
import net.sourceforge.jeuclid.context.Parameter;
import net.sourceforge.jeuclid.converter.Converter;

Tags:把D DO OC CX 
作者:网络 来源:redlevin的专