You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

ExtractText.java 5.9 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129
  1. import java.io.FileInputStream;
  2. import java.io.FileNotFoundException;
  3. import java.io.IOException;
  4. import java.io.InputStream;
  5. import java.lang.StringBuffer;
  6. // https://svn.apache.org/repos/asf/poi/trunk/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java
  7. import org.apache.poi.POIOLE2TextExtractor;
  8. import org.apache.poi.POITextExtractor;
  9. //import org.apache.poi.POIDataSamples;
  10. //import org.apache.poi.extractor.*;
  11. import org.apache.poi.extractor.ExtractorFactory;
  12. import org.apache.poi.hdgf.extractor.VisioTextExtractor;
  13. import org.apache.poi.hpbf.extractor.PublisherTextExtractor;
  14. import org.apache.poi.hslf.extractor.PowerPointExtractor;
  15. import org.apache.poi.hsmf.extractor.OutlookTextExtactor;
  16. import org.apache.poi.hssf.extractor.EventBasedExcelExtractor;
  17. import org.apache.poi.hssf.extractor.ExcelExtractor;
  18. import org.apache.poi.hwpf.extractor.Word6Extractor;
  19. import org.apache.poi.hwpf.extractor.WordExtractor;
  20. import org.apache.poi.poifs.filesystem.POIFSFileSystem;
  21. import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
  22. import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor;
  23. import org.apache.poi.xssf.extractor.XSSFExcelExtractor;
  24. import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
  25. import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
  26. import org.apache.poi.poifs.filesystem.OfficeXmlFileException;
  27. import org.apache.poi.xslf.usermodel.XMLSlideShow; // pptx 2007, http://poi.apache.org/apidocs/org/apache/poi/xslf/
  28. import org.apache.poi.xwpf.usermodel.XWPFDocument; // docx 2007, http://poi.apache.org/apidocs/org/apache/poi/xwpf/
  29. import org.apache.poi.xssf.usermodel.XSSFWorkbook; // xlsx 2007, http://poi.apache.org/apidocs/org/apache/poi/xssf/
  30. class ExtractText
  31. {
  32. public static String file(String path) {
  33. try { return pptx(new FileInputStream(path)); } catch(Exception e) { }
  34. try { return docx(new FileInputStream(path)); } catch(Exception e) { }
  35. try { return xlsx(new FileInputStream(path)); } catch(Exception e) { }
  36. return "";
  37. }
  38. public static String pptx(InputStream in) throws Exception {
  39. XSLFPowerPointExtractor o = new XSLFPowerPointExtractor( new XMLSlideShow(in) );
  40. o.setSlidesByDefault(true);
  41. o.setNotesByDefault(true);
  42. return o.getText();
  43. }
  44. public static String docx(InputStream in) throws Exception {
  45. XWPFWordExtractor o = new XWPFWordExtractor(new XWPFDocument(in));
  46. return o.getText();
  47. }
  48. public static String xlsx(InputStream in) throws Exception {
  49. XSSFExcelExtractor o = new XSSFExcelExtractor(new XSSFWorkbook(in));
  50. return o.getText();
  51. }
  52. public static void main(String argv[]) {
  53. try {
  54. InputStream in = null;
  55. if (argv.length < 1)
  56. in = System.in;
  57. else
  58. in = new FileInputStream(argv[0]);
  59. StringBuffer output = new StringBuffer();
  60. POITextExtractor textExtractor = ExtractorFactory.createExtractor(in);
  61. if (textExtractor instanceof ExcelExtractor) // xls, excel 97-2003
  62. {
  63. ExcelExtractor extractor = (ExcelExtractor) textExtractor;
  64. output.append(extractor.getText());
  65. }
  66. else if (textExtractor instanceof XSSFExcelExtractor) // xlsx, excel 2007
  67. {
  68. XSSFExcelExtractor extractor = (XSSFExcelExtractor) textExtractor;
  69. output.append(extractor.getText());
  70. }
  71. else if (textExtractor instanceof Word6Extractor) // doc, word 95
  72. {
  73. Word6Extractor extractor = (Word6Extractor) textExtractor;
  74. output.append(extractor.getText());
  75. }
  76. else if (textExtractor instanceof WordExtractor) // doc, word 97-2003
  77. {
  78. WordExtractor extractor = (WordExtractor) textExtractor;
  79. output.append(extractor.getText());
  80. }
  81. else if (textExtractor instanceof XWPFWordExtractor) // docx, word 2007
  82. {
  83. XWPFWordExtractor extractor = (XWPFWordExtractor) textExtractor;
  84. output.append(extractor.getText());
  85. }
  86. else if (textExtractor instanceof PowerPointExtractor) // ppt, ppt 97-2003
  87. {
  88. PowerPointExtractor extractor = (PowerPointExtractor) textExtractor;
  89. output.append(extractor.getText());
  90. output.append(extractor.getNotes());
  91. }
  92. else if (textExtractor instanceof XSLFPowerPointExtractor ) // pptx, powerpoint 2007
  93. {
  94. XSLFPowerPointExtractor extractor = (XSLFPowerPointExtractor) textExtractor;
  95. extractor.setSlidesByDefault(true);
  96. extractor.setNotesByDefault(true);
  97. output.append(extractor.getText());
  98. }
  99. else if (textExtractor instanceof VisioTextExtractor) // vsd, visio
  100. {
  101. VisioTextExtractor extractor = (VisioTextExtractor) textExtractor;
  102. output.append(extractor.getText());
  103. }
  104. else if (textExtractor instanceof PublisherTextExtractor) // pub, publisher
  105. {
  106. PublisherTextExtractor extractor = (PublisherTextExtractor) textExtractor;
  107. output.append(extractor.getText());
  108. }
  109. else if (textExtractor instanceof OutlookTextExtactor) // msg, outlook
  110. {
  111. OutlookTextExtactor extractor = (OutlookTextExtactor) textExtractor;
  112. output.append(extractor.getText());
  113. }
  114. System.out.println(output.toString().replaceAll( "[\n\t\r ]+"," "));
  115. }
  116. catch (Exception e)
  117. {
  118. // TODO Auto-generated catch block
  119. //e.printStackTrace();
  120. //System.out.println(e);
  121. }
  122. }
  123. }

人工智能研发终端

Contributors (2)