经验分享:java文档解析【java学习笔记kool】Java开发工程师
文档解析
解析常见的word、excel、ppt、pdf等文档,并将其内容转换成字符串输出。
word解析:
word解析采用apache的开源poi技术,使用时需要把其中所有的jar包都导入项目。
public static void main(String[] args) {
// TODO Auto-generated method stub
//读取word文件,指定文件路径
File file=new File("D:/软件/qq消息/2846227347/FileRecv/qq.doc");
//封装到流中
InputStream fileInputStream = null;
try {
fileInputStream = new FileInputStream(file);
} catch (FileNotFoundException e1) {
// TODO Auto-generated catch block
e1.printStackTrace();
}
try {//微软的解析
XWPFDocument xwpfDocument=new XWPFDocument(fileInputStream);
XWPFWordExtractor xwpfWordExtractor=new XWPFWordExtractor(xwpfDocument);
//WordExtractor wordExtractor=new WordExtractor(fileInputStream);
String text = xwpfWordExtractor.getText();
//wordExtractor.getParagraphText();
System.out.println(text);
} catch (Exception e) {
// TODO Auto-generated catch block
e.printStackTrace();
try {//金山的解析
fileInputStream=new FileInputStream(file);
WordExtractor wordExtractor=new WordExtractor(fileInputStream);
String text = wordExtractor.getText();//获取文本内容直接打印输出
System.out.println(text);
String textFromPieces = wordExtractor.getTextFromPieces();
System.out.println(textFromPieces);
String[] commentsText = wordExtractor.getCommentsText();
for (String string : commentsText) {
System.out.println(string);
}
System.out.println(textFromPieces);
} catch (Exception e2) {
// TODO: handle exception
}
}
excel解析:
还是采用poi技术
public static void main(String[] args) {
// TODO Auto-generated method stub
//作为两个实例用,过程一样
String fileAddr="C:/Users/kaitao/Desktop/打印/请问.xls";
String file2Addr="D:/新建 Microsoft Excel 工作表.xlsx";
File file=new File(fileAddr);
File file2=new File(file2Addr);
FileInputStream fileInputStream=null;
try {
fileInputStream = new FileInputStream(file2);
} catch (FileNotFoundException e2) {
// TODO Auto-generated catch block
e2.printStackTrace();
}
//程序按理说也可以向worder的方式运行,但是程序表示流以关闭,之后试着用了很多方式,都不行,最后只能进行判断了
if (file2Addr.substring(file2Addr.lastIndexOf(".")).equals(".xls")) {
try {//解析xls
HSSFWorkbook wb=new HSSFWorkbook(fileInputStream);
ExcelExtractor excelExtractor=new ExcelExtractor(wb);
String text = excelExtractor.getText();
System.out.println(text);
excelExtractor.close();
wb.close();
} catch (Exception e1) {
// TODO Auto-generated catch block
e1.printStackTrace();
}
}else if (file2Addr.substring(file2Addr.lastIndexOf(".")).equals(".xlsx")) {
try {
XSSFWorkbook xssfWorkbook=new XSSFWorkbook(fileInputStream);
for (Sheet sheet : xssfWorkbook) {
for (Row row : sheet) {
for (Cell cell : row) {
System.out.println(cell);
}
}
}
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
pdf解析:
采用的是pdfbox这个开源技术,同样需要把所有的jar包导进去。映射版的pdf文件不能解析,大家可以试着用itext7试试看。
public static void main(String[] args) {
// TODO Auto-generated method stub
File file=new File("C://Users//kaitao//Desktop//打印//we.pdf");
InputStream inputStream= null;
try {
inputStream=new FileInputStream(file);
} catch (FileNotFoundException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
DataInputStream dataInputStream=new DataInputStream(inputStream);
try {
PDDocument load = PDDocument.load(fileInputStream);
PDFTextStripper pdfTextStripper=new PDFTextStripper();
String text = pdfTextStripper.getText(load);
System.out.println(text);
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
ppt解析:
采用的还是poi的开源技术,只能解析ppt格式的文件,pptx格式的解析不了。
public static void main(String[] args) {
InputStream inputStream = null;
try {
inputStream = new FileInputStream("C:/Users/kaitao/Desktop/打印/请问.ppt");
} catch (FileNotFoundException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
try {
PowerPointExtractor extractor = new PowerPointExtractor(inputStream);
String text = extractor.getText();
System.out.println(text);
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}


1914篇文章