核心提示：把 docx 里面的文字、公式和图片转成 HTML，大体需要如下的方式：先把 docx 文件读成 POI 的 XWPFDocument，Java 代码为 InputStream is = new FileInputStream("d:\\...
把 docx 里面的文字、公式和图片转成 HTML，大体需要如下几个步骤：
第一步：把 docx 文件读成 POI 的 XWPFDocument 对象。
java
InputStream is=new FileInputStream("d:\\1.docx");
XWPFDocument docx = new XWPFDocument(is);
得到内容的列表，其中包括 XWPFParagraph 和 XWPFTable，二者通过 BodyElementType 区分：
// Fetch the body elements of the document and dispatch each one by its type:
// paragraphs go to handleParagraph, tables go to handleTable.
List<IBodyElement> eles = docx.getBodyElements();
for (IBodyElement e : eles) {
    if (e.getElementType() == BodyElementType.PARAGRAPH) {
        XWPFParagraph p = (XWPFParagraph) e;
        handleParagraph(p);
    } else if (e.getElementType() == BodyElementType.TABLE) {
        handleTable((XWPFTable) e);
    }
}
得到XWPFParagraph 后,通过如下两个方法得到XWPFParagraph里面的具体内容
Listruns = p.getRuns();//文本和图片 List pics = run.getEmbeddedPictures();//得到所有图片 再把图片保存起来就可以了。 List oMathList = p.getCTP().getOMathList();//公式 //公式这个就复杂了CTOMath属于XmlObject形式的xml文件,属于OMML,要先转成MathML,再把MathML转成png,保存到硬盘上。 //把XmlObject转成MathML private static String getMathML(XmlObject xmlObject) throws Exception { final String xslFile = "/cn/com/eduedu/jee/util/OMML2MML.XSL"; StreamSource stylesource = new StreamSource(MSDocxUtils.class.getResourceAsStream(xslFile)); Transformer transformer = TransformerFactory.newInstance().newTransformer(stylesource); Node node = xmlObject.getDomNode(); DOMSource source = new DOMSource(node); StringWriter stringwriter = new StringWriter(); StreamResult result = new StreamResult(stringwriter); transformer.setOutputProperty("omit-xml-declaration", "yes"); transformer.transform(source, result); String mathML = stringwriter.toString(); stringwriter.close(); // The native OMML2MML.XSL transforms OMML into MathML as XML having special // name spaces. // We don't need this since we want using the MathML in HTML, not in XML. // So ideally we should changing the OMML2MML.XSL to not do so. // But to take this example as simple as possible, we are using replace to get // rid of the XML specialities. 
mathML = mathML.replaceAll("xmlns:m=\"https://schemas.openxmlformats.org/officeDocument/2006/math\"", ""); mathML = mathML.replaceAll("xmlns:mml", "xmlns"); mathML = mathML.replaceAll("mml:", ""); return mathML; } //MathML转成Document private static Document convertStringToDocument(String xmlStr) { DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); DocumentBuilder builder; try { builder = factory.newDocumentBuilder(); Document doc = builder.parse( new InputSource( new StringReader( xmlStr ) ) ); return doc; } catch (Exception e) { e.printStackTrace(); } return null; } //最后的代码是这样的 private static String convertOmathToPng(XmlObject xmlObject,MSDocxToHtmlImageParser imageParser) throws Exception { pngNumber++; Document document=convertStringToDocument(getMathML(xmlObject)); Converter mathMLConvert =Converter.getInstance(); LayoutContextImpl localLayoutContextImpl = new LayoutContextImpl(LayoutContextImpl.getDefaultLayoutContext()); localLayoutContextImpl.setParameter(Parameter.MATHSIZE, 18); ByteArrayOutputStream os=new ByteArrayOutputStream(); mathMLConvert.convert(document,os, "image/png", localLayoutContextImpl); String pngName=imageParser.parse(os.toByteArray(), "png_"+pngNumber+".png"); os.close(); return " "; }
用到的所有包
import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.StringReader; import java.io.StringWriter; import java.math.BigInteger; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Set; import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.transform.Transformer; import javax.xml.transform.TransformerFactory; import javax.xml.transform.dom.DOMSource; import javax.xml.transform.stream.StreamResult; import javax.xml.transform.stream.StreamSource; import org.apache.poi.POIXMLProperties; import org.apache.poi.xwpf.usermodel.BodyElementType; import org.apache.poi.xwpf.usermodel.IBodyElement; import org.apache.poi.xwpf.usermodel.UnderlinePatterns; import org.apache.poi.xwpf.usermodel.VerticalAlign; import org.apache.poi.xwpf.usermodel.XWPFDocument; import org.apache.poi.xwpf.usermodel.XWPFParagraph; import org.apache.poi.xwpf.usermodel.XWPFPicture; import org.apache.poi.xwpf.usermodel.XWPFRun; import org.apache.poi.xwpf.usermodel.XWPFStyles; import org.apache.poi.xwpf.usermodel.XWPFTable; import org.apache.poi.xwpf.usermodel.XWPFTableCell; import org.apache.poi.xwpf.usermodel.XWPFTableRow; import org.apache.xmlbeans.XmlObject; import org.openxmlformats.schemas.officeDocument.x2006.math.CTOMath; import org.openxmlformats.schemas.officeDocument.x2006.math.CTOMathPara; import org.w3c.dom.Document; import org.w3c.dom.Element; import org.w3c.dom.Node; import org.w3c.dom.NodeList; import org.xml.sax.InputSource; import cn.com.eduedu.jee.util.wordnumber.IWordNumber; import cn.com.eduedu.jee.util.wordnumber.WordNumberFactory; import net.sourceforge.jeuclid.context.LayoutContextImpl; import net.sourceforge.jeuclid.context.Parameter; import net.sourceforge.jeuclid.converter.Converter;