关于lucene处理word、excel、ppt、txt、pdf等格式文件
关于lucene处理word、excel、ppt、txt、pdf等格式文件
关于lucene处理word、excel、ppt、txt、pdf等格式文件的代码如下:
使用jar包如下:
关于lucene处理word、excel、ppt、txt、pdf等格式文件的代码如下:
public class ReadFile { /** * 处理word2003 * @param path * @return * @throws Exception */ public static String readWord(String path) throws Exception { String bodyText = null; InputStream inputStream = new FileInputStream(path); WordExtractor extractor = new WordExtractor(inputStream); bodyText = extractor.getText(); return bodyText; } /** * 处理word2007 * @param path * @return * @throws IOException * @throws OpenXML4JException * @throws XmlException */ public static String readWord2007(String path) throws IOException, OpenXML4JException, XmlException { OPCPackage opcPackage = POIXMLDocument.openPackage(path); POIXMLTextExtractor ex = new XWPFWordExtractor(opcPackage); return ex.getText(); } /** * 处理excel2003 * @param path * @return * @throws IOException */ public static String ReadExcel(String path) throws IOException { InputStream inputStream = null; String content = null; try { inputStream = new FileInputStream(path); HSSFWorkbook wb = new HSSFWorkbook(inputStream); ExcelExtractor extractor = new ExcelExtractor(wb); extractor.setFormulasNotResults(true); extractor.setIncludeSheetNames(false); content = extractor.getText(); } catch (FileNotFoundException e) { e.printStackTrace(); } return content; } /** * 处理excel2007 * @param path * @return * @throws IOException */ public static String readExcel2007(String path) throws IOException { StringBuffer content = new StringBuffer(); // 构造 XSSFWorkbook 对象,strPath 传入文件路径 XSSFWorkbook xwb = new XSSFWorkbook(path); // 循环工作表Sheet for (int numSheet = 0; numSheet < xwb.getNumberOfSheets(); numSheet++) { XSSFSheet xSheet = xwb.getSheetAt(numSheet); if (xSheet == null) { continue; } // 循环行Row for (int rowNum = 0; rowNum <= xSheet.getLastRowNum(); rowNum++) { XSSFRow xRow = xSheet.getRow(rowNum); if (xRow == null) { continue; } // 循环列Cell for (int cellNum = 0; cellNum <= xRow.getLastCellNum(); cellNum++) { XSSFCell xCell = xRow.getCell(cellNum); if (xCell == null) { continue; } if (xCell.getCellType() == XSSFCell.CELL_TYPE_BOOLEAN) { 
content.append(xCell.getBooleanCellValue()); } else if (xCell.getCellType() == XSSFCell.CELL_TYPE_NUMERIC) { content.append(xCell.getNumericCellValue()); } else { content.append(xCell.getStringCellValue()); } } } } return content.toString(); } /** * 处理ppt * @param path * @return */ public static String readPowerPoint(String path) { StringBuffer content = new StringBuffer(""); InputStream inputStream = null; try { inputStream = new FileInputStream(path); SlideShow ss = new SlideShow(new HSLFSlideShow(new FileInputStream(path)));// is // 为文件的InputStream,建立SlideShow Slide[] slides = ss.getSlides();// 获得每一张幻灯片 for (int i = 0; i < slides.length; i++) { TextRun[] t = slides[i].getTextRuns();// 为了取得幻灯片的文字内容,建立TextRun for (int j = 0; j < t.length; j++) { content.append(t[j].getText());// 这里会将文字内容加到content中去 } } } catch (Exception ex) { System.out.println(ex.toString()); } return content.toString(); } /** * 处理pdf * @param path * @return * @throws IOException */ public static String readPdf(String path) throws IOException { StringBuffer content = new StringBuffer("");// 文档内容 PDDocument pdfDocument = null; try { FileInputStream fis = new FileInputStream(path); PDFTextStripper stripper = new PDFTextStripper(); pdfDocument = PDDocument.load(fis); StringWriter writer = new StringWriter(); stripper.writeText(pdfDocument, writer); content.append(writer.getBuffer().toString()); fis.close(); } catch (java.io.IOException e) { System.err.println("IOException=" + e); System.exit(1); } finally { if (pdfDocument != null) { org.pdfbox.cos.COSDocument cos = pdfDocument.getDocument(); cos.close(); pdfDocument.close(); } } return content.toString(); } /** * 处理txt * @param path * @return * @throws IOException */ public static String readTxt(String path) throws IOException { StringBuffer sb = new StringBuffer(""); InputStream is = new FileInputStream(path); // 必须设置成GBK,否则将出现乱码 BufferedReader reader = new BufferedReader(new InputStreamReader(is, "GBK")); try { String line = ""; while ((line = 
reader.readLine()) != null) { sb.append(line + "\r"); } } catch (FileNotFoundException e) { e.printStackTrace(); } return sb.toString().trim(); } }
使用jar包如下:
1 楼
morningking
2011-07-16
能否将这个工程发给我?谢谢,243717042@qq.com。我的代码和你的一样,引的包也一样,但是却总是运行不成功。
2 楼
zcool321
2012-01-03
太感谢了 正需要这个~!~及时雨啊