Apache POI - the Java API for Microsoft Documents
這次用 POI 的目的是練習把 Word、Excel 和 PowerPoint 裡頭的文字擷取出來,如果用 "extract"、"text" 和 "doc"、"docx"、"ppt" 和 "pptx" 當關鍵字搜尋,很容易找到網路上用 Apache POI 所作的範例,此次我就拿這套來練習,那目標呢?我翻了一下 POI 的程式碼,至少要做到 doc、docx、ppt、pptx、xls、xlsx 的文字抽取,而測試結果的確都做得到了。
首先我很懶地看文字,只稍微看到官網的 Text Extraction 介紹,但那段就成為我程式的主要架構了,經過多次的旁敲側擊,讓我發現挖到寶的是 TestExtractorFactory.java!真的是太讚啦,簡單地說我想要了解 POI 跟如何去做的事,完完全全看 TestExtractorFactory.java 就搞定了!包含支援哪些檔案格式、怎樣用,全部清清楚楚,看來從 test case 開始也是另類的偷懶式學習。
最後,程式碼:
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.lang.StringBuffer;
// https://svn.apache.org/repos/asf/poi/trunk/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java
import org.apache.poi.POIOLE2TextExtractor;
import org.apache.poi.POITextExtractor;
//import org.apache.poi.POIDataSamples;
//import org.apache.poi.extractor.*;
import org.apache.poi.extractor.ExtractorFactory;
import org.apache.poi.hdgf.extractor.VisioTextExtractor;
import org.apache.poi.hpbf.extractor.PublisherTextExtractor;
import org.apache.poi.hslf.extractor.PowerPointExtractor;
import org.apache.poi.hsmf.extractor.OutlookTextExtactor;
import org.apache.poi.hssf.extractor.EventBasedExcelExtractor;
import org.apache.poi.hssf.extractor.ExcelExtractor;
import org.apache.poi.hwpf.extractor.Word6Extractor;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor;
import org.apache.poi.xssf.extractor.XSSFExcelExtractor;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.apache.poi.poifs.filesystem.OfficeXmlFileException;
import org.apache.poi.xslf.usermodel.XMLSlideShow; // pptx 2007, http://poi.apache.org/apidocs/org/apache/poi/xslf/
import org.apache.poi.xwpf.usermodel.XWPFDocument; // docx 2007, http://poi.apache.org/apidocs/org/apache/poi/xwpf/
import org.apache.poi.xssf.usermodel.XSSFWorkbook; // xlsx 2007, http://poi.apache.org/apidocs/org/apache/poi/xssf/
/**
 * Extracts plain text from Microsoft Office documents with Apache POI.
 *
 * <p>Run as a program, it reads one document (from the file named by the
 * first argument, or from standard input when no argument is given), lets
 * {@link ExtractorFactory} pick the extractor for the detected format, and
 * prints the text with all whitespace runs collapsed to single spaces.
 *
 * <p>The static helpers {@link #pptx}, {@link #docx} and {@link #xlsx}
 * extract text from an OOXML stream of a known type; {@link #file} tries
 * them in turn on a file path.
 *
 * Usage modelled on POI's own test case:
 * https://svn.apache.org/repos/asf/poi/trunk/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java
 */
class ExtractText
{
    /**
     * Extracts the text of an OOXML file by trying the pptx, docx and xlsx
     * readers in that order.
     *
     * @param path path of the file to read
     * @return the text of the first reader that succeeds, or the empty
     *         string when none of them can read the file
     */
    public static String file(String path)
    {
        InputStream in = null;

        try
        {
            in = new FileInputStream(path);
            return pptx(in);
        }
        catch (Exception ignored)
        {
            // Not readable as pptx: fall through to the next format.
        }
        finally
        {
            closeQuietly(in);
        }

        in = null;
        try
        {
            in = new FileInputStream(path);
            return docx(in);
        }
        catch (Exception ignored)
        {
            // Not readable as docx: fall through to the next format.
        }
        finally
        {
            closeQuietly(in);
        }

        in = null;
        try
        {
            in = new FileInputStream(path);
            return xlsx(in);
        }
        catch (Exception ignored)
        {
            // Not readable as xlsx either: report "no text" below.
        }
        finally
        {
            closeQuietly(in);
        }

        return "";
    }

    /**
     * Extracts slide and notes text from a pptx (PowerPoint 2007) stream.
     *
     * @param in stream positioned at the start of the document; the caller
     *           remains responsible for closing it
     * @return the extracted text
     * @throws Exception whatever POI raises while opening or reading the document
     */
    public static String pptx(InputStream in) throws Exception
    {
        XSLFPowerPointExtractor extractor = new XSLFPowerPointExtractor(new XMLSlideShow(in));
        // Ask for both slide bodies and speaker notes.
        extractor.setSlidesByDefault(true);
        extractor.setNotesByDefault(true);
        return extractor.getText();
    }

    /**
     * Extracts the text of a docx (Word 2007) stream.
     *
     * @param in stream positioned at the start of the document; the caller
     *           remains responsible for closing it
     * @return the extracted text
     * @throws Exception whatever POI raises while opening or reading the document
     */
    public static String docx(InputStream in) throws Exception
    {
        return new XWPFWordExtractor(new XWPFDocument(in)).getText();
    }

    /**
     * Extracts the text of an xlsx (Excel 2007) stream.
     *
     * @param in stream positioned at the start of the document; the caller
     *           remains responsible for closing it
     * @return the extracted text
     * @throws Exception whatever POI raises while opening or reading the document
     */
    public static String xlsx(InputStream in) throws Exception
    {
        return new XSSFExcelExtractor(new XSSFWorkbook(in)).getText();
    }

    /**
     * Program entry point: prints the whitespace-normalised text of one document.
     *
     * @param argv optional single argument naming the document; when absent
     *             the document is read from standard input
     */
    public static void main(String argv[])
    {
        InputStream in = null;
        try
        {
            // The original referenced an undeclared "path" here; the file
            // name is the first command-line argument.
            in = (argv.length < 1) ? System.in : new FileInputStream(argv[0]);

            POITextExtractor textExtractor = ExtractorFactory.createExtractor(in);
            String text = textOf(textExtractor);

            // Collapse every run of newlines, tabs, CRs and spaces to one space.
            System.out.println(text.replaceAll( "[\n\t\r ]+"," "));
        }
        catch (Exception e)
        {
            // Report the failure instead of silently printing nothing.
            System.err.println("ExtractText: " + e);
        }
        finally
        {
            closeQuietly(in);
        }
    }

    /**
     * Returns the text held by an extractor created by {@link ExtractorFactory}.
     *
     * <p>The two PowerPoint extractors get special treatment so that speaker
     * notes are included; every other extractor (xls, xlsx, doc, docx, vsd,
     * pub, msg, ...) is read through the common {@code getText()} method.
     */
    private static String textOf(POITextExtractor textExtractor)
    {
        if (textExtractor instanceof PowerPointExtractor) // ppt, PowerPoint 97-2003
        {
            PowerPointExtractor extractor = (PowerPointExtractor) textExtractor;
            // Slide text first, then the notes, as two separate calls.
            return extractor.getText() + extractor.getNotes();
        }
        if (textExtractor instanceof XSLFPowerPointExtractor) // pptx, PowerPoint 2007
        {
            XSLFPowerPointExtractor extractor = (XSLFPowerPointExtractor) textExtractor;
            extractor.setSlidesByDefault(true);
            extractor.setNotesByDefault(true);
            return extractor.getText();
        }
        return textExtractor.getText();
    }

    /** Closes a stream if it was opened, ignoring any failure to close. */
    private static void closeQuietly(InputStream in)
    {
        if (in == null)
        {
            return;
        }
        try
        {
            in.close();
        }
        catch (IOException ignored)
        {
            // Nothing useful can be done about a failed close of a read-only stream.
        }
    }
}
編譯,使用 poi-bin-3.8-20120326.zip 版本:
$ javac -Xlint:deprecation -cp .:lib-3.8/commons-logging-1.1.jar:lib-3.8/poi-3.8-20120326.jar:lib-3.8/poi-ooxml-schemas-3.8-20120326.jar:lib-3.8/dom4j-1.6.1.jar:lib-3.8/poi-examples-3.8-20120326.jar:lib-3.8/poi-scratchpad-3.8-20120326.jar:lib-3.8/junit-3.8.1.jar:lib-3.8/poi-excelant-3.8-20120326.jar:lib-3.8/stax-api-1.0.1.jar:lib-3.8/log4j-1.2.13.jar:lib-3.8/poi-ooxml-3.8-20120326.jar:lib-3.8/xmlbeans-2.3.0.jar ExtractText.java
執行,使用 poi-bin-3.8-20120326.zip 版本:
$ java -cp .:lib-3.8/commons-logging-1.1.jar:lib-3.8/poi-3.8-20120326.jar:lib-3.8/poi-ooxml-schemas-3.8-20120326.jar:lib-3.8/dom4j-1.6.1.jar:lib-3.8/poi-examples-3.8-20120326.jar:lib-3.8/poi-scratchpad-3.8-20120326.jar:lib-3.8/junit-3.8.1.jar:lib-3.8/poi-excelant-3.8-20120326.jar:lib-3.8/stax-api-1.0.1.jar:lib-3.8/log4j-1.2.13.jar:lib-3.8/poi-ooxml-3.8-20120326.jar:lib-3.8/xmlbeans-2.3.0.jar ExtractText < MyOfficeFile2003.doc
$ java -cp .:lib-3.8/commons-logging-1.1.jar:lib-3.8/poi-3.8-20120326.jar:lib-3.8/poi-ooxml-schemas-3.8-20120326.jar:lib-3.8/dom4j-1.6.1.jar:lib-3.8/poi-examples-3.8-20120326.jar:lib-3.8/poi-scratchpad-3.8-20120326.jar:lib-3.8/junit-3.8.1.jar:lib-3.8/poi-excelant-3.8-20120326.jar:lib-3.8/stax-api-1.0.1.jar:lib-3.8/log4j-1.2.13.jar:lib-3.8/poi-ooxml-3.8-20120326.jar:lib-3.8/xmlbeans-2.3.0.jar ExtractText < MyOfficeFile2003.xls
$ java -cp .:lib-3.8/commons-logging-1.1.jar:lib-3.8/poi-3.8-20120326.jar:lib-3.8/poi-ooxml-schemas-3.8-20120326.jar:lib-3.8/dom4j-1.6.1.jar:lib-3.8/poi-examples-3.8-20120326.jar:lib-3.8/poi-scratchpad-3.8-20120326.jar:lib-3.8/junit-3.8.1.jar:lib-3.8/poi-excelant-3.8-20120326.jar:lib-3.8/stax-api-1.0.1.jar:lib-3.8/log4j-1.2.13.jar:lib-3.8/poi-ooxml-3.8-20120326.jar:lib-3.8/xmlbeans-2.3.0.jar ExtractText < MyOfficeFile2003.ppt
$ java -cp .:lib-3.8/commons-logging-1.1.jar:lib-3.8/poi-3.8-20120326.jar:lib-3.8/poi-ooxml-schemas-3.8-20120326.jar:lib-3.8/dom4j-1.6.1.jar:lib-3.8/poi-examples-3.8-20120326.jar:lib-3.8/poi-scratchpad-3.8-20120326.jar:lib-3.8/junit-3.8.1.jar:lib-3.8/poi-excelant-3.8-20120326.jar:lib-3.8/stax-api-1.0.1.jar:lib-3.8/log4j-1.2.13.jar:lib-3.8/poi-ooxml-3.8-20120326.jar:lib-3.8/xmlbeans-2.3.0.jar ExtractText < MyOfficeFile2007.docx
$ java -cp .:lib-3.8/commons-logging-1.1.jar:lib-3.8/poi-3.8-20120326.jar:lib-3.8/poi-ooxml-schemas-3.8-20120326.jar:lib-3.8/dom4j-1.6.1.jar:lib-3.8/poi-examples-3.8-20120326.jar:lib-3.8/poi-scratchpad-3.8-20120326.jar:lib-3.8/junit-3.8.1.jar:lib-3.8/poi-excelant-3.8-20120326.jar:lib-3.8/stax-api-1.0.1.jar:lib-3.8/log4j-1.2.13.jar:lib-3.8/poi-ooxml-3.8-20120326.jar:lib-3.8/xmlbeans-2.3.0.jar ExtractText < MyOfficeFile2007.xlsx
$ java -cp .:lib-3.8/commons-logging-1.1.jar:lib-3.8/poi-3.8-20120326.jar:lib-3.8/poi-ooxml-schemas-3.8-20120326.jar:lib-3.8/dom4j-1.6.1.jar:lib-3.8/poi-examples-3.8-20120326.jar:lib-3.8/poi-scratchpad-3.8-20120326.jar:lib-3.8/junit-3.8.1.jar:lib-3.8/poi-excelant-3.8-20120326.jar:lib-3.8/stax-api-1.0.1.jar:lib-3.8/log4j-1.2.13.jar:lib-3.8/poi-ooxml-3.8-20120326.jar:lib-3.8/xmlbeans-2.3.0.jar ExtractText < MyOfficeFile2007.pptx
只能說這軟體架構不錯,僅須稍微修改就能弄成自動判斷格式囉 :D 在此開發的環境是 Ubuntu ,因此在 -cp 參數的數值,多個 jar 檔是用冒號 ":" 分隔的,若是 Windows 的話,要改成用分號 ";" 分隔。