2012年4月18日 星期三

[Java] 使用 Apache POI 擷取 Word、Excel、PowerPoint 文字


Apache POI - the Java API for Microsoft Documents


這次用 POI 的目的是練習把 Word、Excel 和 PowerPoint 裡頭的文字擷取出來,如果用 "extract"、"text" 和 "doc"、"docx"、"ppt"、"pptx" 等關鍵字搜尋,很容易找到網路上用 Apache POI 所作的範例,此次我就拿這套來練習,那目標呢?我翻了一下 POI 的程式碼,至少要做到 doc、docx、ppt、pptx、xls、xlsx 的文字抽取,而測試結果的確都做得到了。


首先我很懶地看文字,只稍微看到官網的 Text Extraction 介紹,但那段就成為我程式的主要架構了,經過多次的旁敲側擊,讓我發現挖到寶的是 TestExtractorFactory.java!真的是太讚啦,簡單地說我想要了解 POI 跟如何去做的事,完完全全看 TestExtractorFactory.java 就搞定了!包含支援哪些檔案格式、怎樣用,全部清清楚楚,看來從 test case 開始也是另類的偷懶式學習。


最後,程式碼:


import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.lang.StringBuffer;


// https://svn.apache.org/repos/asf/poi/trunk/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java
import org.apache.poi.POIOLE2TextExtractor;
import org.apache.poi.POITextExtractor;
//import org.apache.poi.POIDataSamples;
//import org.apache.poi.extractor.*;
import org.apache.poi.extractor.ExtractorFactory;
import org.apache.poi.hdgf.extractor.VisioTextExtractor;
import org.apache.poi.hpbf.extractor.PublisherTextExtractor;
import org.apache.poi.hslf.extractor.PowerPointExtractor;
import org.apache.poi.hsmf.extractor.OutlookTextExtactor;
import org.apache.poi.hssf.extractor.EventBasedExcelExtractor;
import org.apache.poi.hssf.extractor.ExcelExtractor;
import org.apache.poi.hwpf.extractor.Word6Extractor;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor;
import org.apache.poi.xssf.extractor.XSSFExcelExtractor;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;


import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.apache.poi.poifs.filesystem.OfficeXmlFileException;


import org.apache.poi.xslf.usermodel.XMLSlideShow; // pptx 2007, http://poi.apache.org/apidocs/org/apache/poi/xslf/
import org.apache.poi.xwpf.usermodel.XWPFDocument; // docx 2007, http://poi.apache.org/apidocs/org/apache/poi/xwpf/
import org.apache.poi.xssf.usermodel.XSSFWorkbook; // xlsx 2007, http://poi.apache.org/apidocs/org/apache/poi/xssf/


class ExtractText
{
    public static String file(String path)
    {
        try { return pptx(new FileInputStream(path)); } catch(Exception e) { }
        try { return docx(new FileInputStream(path)); } catch(Exception e) { }
        try { return xlsx(new FileInputStream(path)); } catch(Exception e) { }
        return "";
    }
    public static String pptx(InputStream in) throws Exception
    {
        XSLFPowerPointExtractor o = new XSLFPowerPointExtractor( new XMLSlideShow(in) );
        o.setSlidesByDefault(true);
        o.setNotesByDefault(true);
        return o.getText();
    }
    public static String docx(InputStream in) throws Exception
    {
        XWPFWordExtractor o = new XWPFWordExtractor(new XWPFDocument(in));
        return o.getText();
    }
    public static String xlsx(InputStream in) throws Exception
    {
        XSSFExcelExtractor o = new XSSFExcelExtractor(new XSSFWorkbook(in));
        return o.getText();
    }
    public static void main(String argv[])
    {
        try
        {
            InputStream in = null;
            if( argv.length < 1 )
                in = System.in;
            else
                in = new FileInputStream(path);
            StringBuffer output = new StringBuffer();
            POITextExtractor textExtractor = ExtractorFactory.createExtractor(in);
            //POIFSFileSystem fileSystem = new POIFSFileSystem(in);
            //POITextExtractor textExtractor = ExtractorFactory.createExtractor(fileSystem);
            //POIOLE2TextExtractor oleTextExtractor = ExtractorFactory.createExtractor(fileSystem);
            //POITextExtractor[] embeddedExtractors = ExtractorFactory.getEmbededDocsTextExtractors(oleTextExtractor);
            //for (POITextExtractor textExtractor : embeddedExtractors)
            {
                if (textExtractor instanceof ExcelExtractor) // xls, excel 97-2003
                {
                    ExcelExtractor extractor = (ExcelExtractor) textExtractor;
                    //System.out.println(extractor.getText());
                    output.append(extractor.getText());
                }
                else if (textExtractor instanceof XSSFExcelExtractor) // xlsx, excel 2007
                {
                    XSSFExcelExtractor extractor = (XSSFExcelExtractor) textExtractor;
                    //System.out.println(extractor.getText());
                    output.append(extractor.getText());
                }
                else if (textExtractor instanceof Word6Extractor) // doc, word 95
                {
                    Word6Extractor extractor = (Word6Extractor) textExtractor;
                    // http://poi.apache.org/apidocs/org/apache/poi/hwpf/extractor/Word6Extractor.html
                    //for (String paragraph : extractor.getParagraphText() )
                    // System.out.println(paragraph);
                    //System.out.println(extractor.getText());
                    output.append(extractor.getText());
                }
                else if (textExtractor instanceof WordExtractor) // doc, word 97-2003
                {
                    WordExtractor extractor = (WordExtractor) textExtractor;
                    // http://poi.apache.org/apidocs/org/apache/poi/hwpf/extractor/WordExtractor.html
                    //System.out.println(extractor.getHeaderText());
                    //System.out.println(extractor.getFooterText());
                    //for (String paragraph : extractor.getParagraphText() )
                    // System.out.println(paragraph);
                    //System.out.println(extractor.getText());
                    output.append(extractor.getText());
                }
                else if (textExtractor instanceof XWPFWordExtractor) // docx, word 2007
                {
                    XWPFWordExtractor extractor = (XWPFWordExtractor) textExtractor;
                    //System.out.println(extractor.getText());
                    output.append(extractor.getText());
                }
                else if (textExtractor instanceof PowerPointExtractor) // ppt, ppt 97-2003
                {
                    PowerPointExtractor extractor = (PowerPointExtractor) textExtractor;
                    //System.out.println(extractor.getText());
                    //System.out.println(extractor.getNotes());
                    output.append(extractor.getText());
                    output.append(extractor.getNotes());
                }
                else if (textExtractor instanceof XSLFPowerPointExtractor ) // pptx, powerpoint 2007
                {
                    XSLFPowerPointExtractor extractor = (XSLFPowerPointExtractor) textExtractor;
                    extractor.setSlidesByDefault(true);
                    extractor.setNotesByDefault(true);
                    //System.out.println(extractor.getText());
                    output.append(extractor.getText());
                }
                else if (textExtractor instanceof VisioTextExtractor) // vsd, visio
                {
                    VisioTextExtractor extractor = (VisioTextExtractor) textExtractor;
                    //System.out.println(extractor.getText());
                    output.append(extractor.getText());
                }
                else if (textExtractor instanceof PublisherTextExtractor) // pub, publisher
                {
                    PublisherTextExtractor extractor = (PublisherTextExtractor) textExtractor;
                    //System.out.println(extractor.getText());
                    output.append(extractor.getText());
                }
                else if (textExtractor instanceof OutlookTextExtactor) // msg, outlook
                {
                    OutlookTextExtactor extractor = (OutlookTextExtactor) textExtractor;
                    //System.out.println(extractor.getText());
                    output.append(extractor.getText());
                }
            }
            System.out.println(output.toString().replaceAll( "[\n\t\r ]+"," "));
        }
        catch (Exception e)
        {
            // TODO Auto-generated catch block
            // e.printStackTrace();
            // System.out.println(e);
        }
    }
}


編譯,使用 poi-bin-3.8-20120326.zip 版本:


$ javac -Xlint:deprecation -cp .:lib-3.8/commons-logging-1.1.jar:lib-3.8/poi-3.8-20120326.jar:lib-3.8/poi-ooxml-schemas-3.8-20120326.jar:lib-3.8/dom4j-1.6.1.jar:lib-3.8/poi-examples-3.8-20120326.jar:lib-3.8/poi-scratchpad-3.8-20120326.jar:lib-3.8/junit-3.8.1.jar:lib-3.8/poi-excelant-3.8-20120326.jar:lib-3.8/stax-api-1.0.1.jar:lib-3.8/log4j-1.2.13.jar:lib-3.8/poi-ooxml-3.8-20120326.jar:lib-3.8/xmlbeans-2.3.0.jar ExtractText.java


執行,使用 poi-bin-3.8-20120326.zip 版本:


$ java -cp .:lib-3.8/commons-logging-1.1.jar:lib-3.8/poi-3.8-20120326.jar:lib-3.8/poi-ooxml-schemas-3.8-20120326.jar:lib-3.8/dom4j-1.6.1.jar:lib-3.8/poi-examples-3.8-20120326.jar:lib-3.8/poi-scratchpad-3.8-20120326.jar:lib-3.8/junit-3.8.1.jar:lib-3.8/poi-excelant-3.8-20120326.jar:lib-3.8/stax-api-1.0.1.jar:lib-3.8/log4j-1.2.13.jar:lib-3.8/poi-ooxml-3.8-20120326.jar:lib-3.8/xmlbeans-2.3.0.jar ExtractText < MyOfficeFile2003.doc


$ java -cp .:lib-3.8/commons-logging-1.1.jar:lib-3.8/poi-3.8-20120326.jar:lib-3.8/poi-ooxml-schemas-3.8-20120326.jar:lib-3.8/dom4j-1.6.1.jar:lib-3.8/poi-examples-3.8-20120326.jar:lib-3.8/poi-scratchpad-3.8-20120326.jar:lib-3.8/junit-3.8.1.jar:lib-3.8/poi-excelant-3.8-20120326.jar:lib-3.8/stax-api-1.0.1.jar:lib-3.8/log4j-1.2.13.jar:lib-3.8/poi-ooxml-3.8-20120326.jar:lib-3.8/xmlbeans-2.3.0.jar ExtractText < MyOfficeFile2003.xls


$ java -cp .:lib-3.8/commons-logging-1.1.jar:lib-3.8/poi-3.8-20120326.jar:lib-3.8/poi-ooxml-schemas-3.8-20120326.jar:lib-3.8/dom4j-1.6.1.jar:lib-3.8/poi-examples-3.8-20120326.jar:lib-3.8/poi-scratchpad-3.8-20120326.jar:lib-3.8/junit-3.8.1.jar:lib-3.8/poi-excelant-3.8-20120326.jar:lib-3.8/stax-api-1.0.1.jar:lib-3.8/log4j-1.2.13.jar:lib-3.8/poi-ooxml-3.8-20120326.jar:lib-3.8/xmlbeans-2.3.0.jar ExtractText < MyOfficeFile2003.ppt


$ java -cp .:lib-3.8/commons-logging-1.1.jar:lib-3.8/poi-3.8-20120326.jar:lib-3.8/poi-ooxml-schemas-3.8-20120326.jar:lib-3.8/dom4j-1.6.1.jar:lib-3.8/poi-examples-3.8-20120326.jar:lib-3.8/poi-scratchpad-3.8-20120326.jar:lib-3.8/junit-3.8.1.jar:lib-3.8/poi-excelant-3.8-20120326.jar:lib-3.8/stax-api-1.0.1.jar:lib-3.8/log4j-1.2.13.jar:lib-3.8/poi-ooxml-3.8-20120326.jar:lib-3.8/xmlbeans-2.3.0.jar ExtractText < MyOfficeFile2007.docx


$ java -cp .:lib-3.8/commons-logging-1.1.jar:lib-3.8/poi-3.8-20120326.jar:lib-3.8/poi-ooxml-schemas-3.8-20120326.jar:lib-3.8/dom4j-1.6.1.jar:lib-3.8/poi-examples-3.8-20120326.jar:lib-3.8/poi-scratchpad-3.8-20120326.jar:lib-3.8/junit-3.8.1.jar:lib-3.8/poi-excelant-3.8-20120326.jar:lib-3.8/stax-api-1.0.1.jar:lib-3.8/log4j-1.2.13.jar:lib-3.8/poi-ooxml-3.8-20120326.jar:lib-3.8/xmlbeans-2.3.0.jar ExtractText < MyOfficeFile2007.xlsx


$ java -cp .:lib-3.8/commons-logging-1.1.jar:lib-3.8/poi-3.8-20120326.jar:lib-3.8/poi-ooxml-schemas-3.8-20120326.jar:lib-3.8/dom4j-1.6.1.jar:lib-3.8/poi-examples-3.8-20120326.jar:lib-3.8/poi-scratchpad-3.8-20120326.jar:lib-3.8/junit-3.8.1.jar:lib-3.8/poi-excelant-3.8-20120326.jar:lib-3.8/stax-api-1.0.1.jar:lib-3.8/log4j-1.2.13.jar:lib-3.8/poi-ooxml-3.8-20120326.jar:lib-3.8/xmlbeans-2.3.0.jar ExtractText < MyOfficeFile2007.pptx


只能說這軟體架構不錯,僅須稍微修改就能弄成自動判斷格式囉 :D 在此開發的環境是 Ubuntu ,因此在 -cp 參數的數值,多個 jar 檔是用冒號 ":" 分隔的,若是 Windows 的話,要改成用分號 ";" 分隔。


沒有留言:

張貼留言