Lucene如何解析Doc文档
加入poi-scratchpad-3.0.2-FINAL-20080204.jar到lib下
package com.cs;
/**
 * Read-only accessors for a document's title, full text and summary.
 * Implemented in this file by {@code DocParser} and {@code TextParser}.
 */
public interface Parsable {

    /** Returns the title of the document. */
    String getTitle();

    /** Returns the full text content of the document. */
    String getContent();

    /** Returns a summary of the document content. */
    String getSummary();
}
package com.cs;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import org.apache.poi.hwpf.extractor.WordExtractor;
public class DocParser implements Parsable {
private File file;
private String content;
private WordExtractor wordExtractor;
/**
 * Creates a parser for the given Word document. The file is not opened until
 * the content is first requested.
 *
 * @param file the .doc file to parse; must not be {@code null}
 * @throws NullPointerException if {@code file} is {@code null}
 */
public DocParser(File file) {
    // Fail fast: with a null file every accessor of this class would throw a
    // NullPointerException later, far away from the actual mistake.
    if (file == null) {
        throw new NullPointerException("file must not be null");
    }
    this.file = file;
}
/**
 * Extracts the text of the Word document and caches it, so the file is read
 * at most once after a successful extraction.
 *
 * @return the extracted text, or {@code null} if the file could not be opened or parsed
 */
public String getContent() {
    if (content != null) {
        return content;
    }
    InputStream is = null;
    try {
        is = new FileInputStream(file);
        wordExtractor = new WordExtractor(is);
        content = wordExtractor.getText();
        return content;
    } catch (IOException e) {
        // FileNotFoundException is a subclass of IOException, so this single
        // handler covers both of the original catch blocks.
        e.printStackTrace();
        return null;
    } finally {
        // Always release the file handle, whether extraction succeeded or not.
        if (is != null) {
            try {
                is.close();
            } catch (IOException ignored) {
                // A failure while closing does not affect the result already computed.
            }
        }
    }
}
/**
 * Returns the summary: the first 200 characters of the content, or the whole
 * content when it is not longer than that.
 *
 * @return the summary, or {@code null} if the content could not be read
 */
public String getSummary() {
    final int maxLength = 200;
    // getContent() returns the cached text when available and loads it otherwise;
    // it yields null when the file could not be read, which must not be dereferenced.
    String text = getContent();
    if (text == null) {
        return null;
    }
    return text.length() > maxLength ? text.substring(0, maxLength) : text;
}
/**
 * Returns the title of the document, which is the name of the underlying file.
 *
 * @return the file name as reported by {@link File#getName()}
 */
public String getTitle() {
    String fileName = file.getName();
    return fileName;
}
/**
 * Manual check: parses one hard-coded sample document and prints its text
 * to standard output.
 *
 * @param args ignored
 */
public static void main(String[] args) {
    File sample = new File("E:\\EclipseStudyWorkspace\\LuceneParse\\fileSource\\XPDF使用文档.doc");
    DocParser parser = new DocParser(sample);
    String text = parser.getContent();
    System.out.println("doc content : " + text);
}
}
txt的解析
package com.cs;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
public class TextParser implements Parsable {
private File file ;
private String content ;
/**
 * Creates a parser for the given text file. The file is not opened until the
 * content is first requested.
 *
 * @param file the text file to parse; must not be {@code null}
 * @throws NullPointerException if {@code file} is {@code null}
 */
public TextParser(File file) {
    // Fail fast: with a null file every accessor of this class would throw a
    // NullPointerException later, far away from the actual mistake.
    if (file == null) {
        throw new NullPointerException("file must not be null");
    }
    this.file = file;
}
/**
 * Reads the whole text file and caches the result, so the file is read at most
 * once after a successful read. Every line read is followed by a "\n" in the
 * returned text, including the last one.
 *
 * @return the file text, or {@code null} if the file could not be opened or read
 */
public String getContent() {
    if (content != null) {
        return content;
    }
    BufferedReader br = null;
    try {
        // NOTE: no charset is given, so InputStreamReader decodes the bytes with
        // the platform default charset.
        br = new BufferedReader(new InputStreamReader(new FileInputStream(file)));
        StringBuilder sb = new StringBuilder();
        String line;
        while ((line = br.readLine()) != null) {
            sb.append(line).append("\n");
        }
        content = sb.toString();
        return content;
    } catch (IOException e) {
        // FileNotFoundException is a subclass of IOException, so this single
        // handler covers both of the original catch blocks.
        e.printStackTrace();
        return null;
    } finally {
        // Closing the reader also closes the wrapped file stream.
        if (br != null) {
            try {
                br.close();
            } catch (IOException ignored) {
                // A failure while closing does not affect the result already computed.
            }
        }
    }
}
/**
 * Returns the summary: the first 200 characters of the content, or the whole
 * content when it is not longer than that.
 *
 * @return the summary, or {@code null} if the content could not be read
 */
public String getSummary() {
    final int maxLength = 200;
    // getContent() returns the cached text when available and loads it otherwise;
    // it yields null when the file could not be read, which must not be dereferenced.
    String text = getContent();
    if (text == null) {
        return null;
    }
    return text.length() > maxLength ? text.substring(0, maxLength) : text;
}
/**
 * Returns the title of the document, which is the name of the underlying file.
 *
 * @return the file name as reported by {@link File#getName()}
 */
public String getTitle() {
    String fileName = file.getName();
    return fileName;
}
/**
 * Manual check: reads one hard-coded sample text file and prints its content
 * to standard output.
 *
 * @param args ignored
 */
public static void main(String[] args) {
    File sample = new File("E:\\EclipseStudyWorkspace\\LuceneParse\\fileSource\\文档.txt");
    TextParser parser = new TextParser(sample);
    String text = parser.getContent();
    System.out.println("text content : " + text);
}
}
相关推荐
renjinlong 2020-09-03
Jacry 2020-07-04
IceStreamLab 2020-06-26
mengyue 2020-06-09
PasserbyX 2020-05-16
mameng 2020-05-12
心丨悦 2020-05-06
编码之路 2020-05-03
mengyue 2020-05-02
qiuzhuoxian 2020-02-23
编码之路 2020-02-20
lionelf 2020-02-03
TyCoding 2020-02-01
heniancheng 2020-01-31
某某某 2020-01-30
PinkBean 2020-01-29
某某某 2020-01-12
编码之路 2020-01-01
itmale 2020-01-01