import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStream; import java.lang.StringBuffer; // https://svn.apache.org/repos/asf/poi/trunk/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java import org.apache.poi.POIOLE2TextExtractor; import org.apache.poi.POITextExtractor; //import org.apache.poi.POIDataSamples; //import org.apache.poi.extractor.*; import org.apache.poi.extractor.ExtractorFactory; import org.apache.poi.hdgf.extractor.VisioTextExtractor; import org.apache.poi.hpbf.extractor.PublisherTextExtractor; import org.apache.poi.hslf.extractor.PowerPointExtractor; import org.apache.poi.hsmf.extractor.OutlookTextExtactor; import org.apache.poi.hssf.extractor.EventBasedExcelExtractor; import org.apache.poi.hssf.extractor.ExcelExtractor; import org.apache.poi.hwpf.extractor.Word6Extractor; import org.apache.poi.hwpf.extractor.WordExtractor; import org.apache.poi.poifs.filesystem.POIFSFileSystem; import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor; import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor; import org.apache.poi.xssf.extractor.XSSFExcelExtractor; import org.apache.poi.xwpf.extractor.XWPFWordExtractor; import org.apache.poi.openxml4j.exceptions.InvalidFormatException; import org.apache.poi.poifs.filesystem.OfficeXmlFileException; import org.apache.poi.xslf.usermodel.XMLSlideShow; // pptx 2007, http://poi.apache.org/apidocs/org/apache/poi/xslf/ import org.apache.poi.xwpf.usermodel.XWPFDocument; // docx 2007, http://poi.apache.org/apidocs/org/apache/poi/xwpf/ import org.apache.poi.xssf.usermodel.XSSFWorkbook; // xlsx 2007, http://poi.apache.org/apidocs/org/apache/poi/xssf/ class ExtractText { public static String file(String path) { try { return pptx(new FileInputStream(path)); } catch(Exception e) { } try { return docx(new FileInputStream(path)); } catch(Exception e) { } try { return xlsx(new FileInputStream(path)); } catch(Exception e) { } return ""; } public static String pptx(InputStream in) throws Exception { XSLFPowerPointExtractor o = new XSLFPowerPointExtractor( new XMLSlideShow(in) ); o.setSlidesByDefault(true); o.setNotesByDefault(true); return o.getText(); } public static String docx(InputStream in) throws Exception { XWPFWordExtractor o = new XWPFWordExtractor(new XWPFDocument(in)); return o.getText(); } public static String xlsx(InputStream in) throws Exception { XSSFExcelExtractor o = new XSSFExcelExtractor(new XSSFWorkbook(in)); return o.getText(); } public static void main(String argv[]) { try { InputStream in = null; if (argv.length < 1) in = System.in; else in = new FileInputStream(argv[0]); StringBuffer output = new StringBuffer(); POITextExtractor textExtractor = ExtractorFactory.createExtractor(in); if (textExtractor instanceof ExcelExtractor) // xls, excel 97-2003 { ExcelExtractor extractor = (ExcelExtractor) textExtractor; output.append(extractor.getText()); } else if (textExtractor instanceof XSSFExcelExtractor) // xlsx, excel 2007 { XSSFExcelExtractor extractor = (XSSFExcelExtractor) textExtractor; output.append(extractor.getText()); } else if (textExtractor instanceof Word6Extractor) // doc, word 95 { Word6Extractor extractor = (Word6Extractor) textExtractor; output.append(extractor.getText()); } else if (textExtractor instanceof WordExtractor) // doc, word 97-2003 { WordExtractor extractor = (WordExtractor) textExtractor; output.append(extractor.getText()); } else if (textExtractor instanceof XWPFWordExtractor) // docx, word 2007 { XWPFWordExtractor extractor = (XWPFWordExtractor) textExtractor; output.append(extractor.getText()); } else if (textExtractor instanceof PowerPointExtractor) // ppt, ppt 97-2003 { PowerPointExtractor extractor = (PowerPointExtractor) textExtractor; output.append(extractor.getText()); output.append(extractor.getNotes()); } else if (textExtractor instanceof XSLFPowerPointExtractor ) // pptx, powerpoint 2007 { XSLFPowerPointExtractor extractor = (XSLFPowerPointExtractor) textExtractor; extractor.setSlidesByDefault(true); extractor.setNotesByDefault(true); output.append(extractor.getText()); } else if (textExtractor instanceof VisioTextExtractor) // vsd, visio { VisioTextExtractor extractor = (VisioTextExtractor) textExtractor; output.append(extractor.getText()); } else if (textExtractor instanceof PublisherTextExtractor) // pub, publisher { PublisherTextExtractor extractor = (PublisherTextExtractor) textExtractor; output.append(extractor.getText()); } else if (textExtractor instanceof OutlookTextExtactor) // msg, outlook { OutlookTextExtactor extractor = (OutlookTextExtactor) textExtractor; output.append(extractor.getText()); } System.out.println(output.toString().replaceAll( "[\n\t\r ]+"," ")); } catch (Exception e) { // TODO Auto-generated catch block //e.printStackTrace(); //System.out.println(e); } } }