lucene自定义分词器
感谢http://qindongliang1922.iteye.com/blog/1927605
这篇文章让我豁然开朗~
建议研究lucene时一定要下载源码
下面代码中有个bug,一开始没弄明白:用这个分词器建完索引后发现搜不到东西。问题出在tokenStart和tokenEnd上——这两个值表示该词在原文中的起止偏移位置。
package TEST;
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.util.AttributeFactory;
/**
 * A simple tokenizer: splits the input on the delimiter characters in
 * {@link #PUNCTION} and lower-cases ASCII letters. Emits one token per
 * delimiter-free run of characters, with correct character offsets.
 */
public class My extends Tokenizer {
    /** Accumulates the characters of the token currently being built. */
    private final StringBuilder buffer = new StringBuilder();
    /** Character offsets (into the input) of the current token's start and end. */
    private int tokenStart = 0, tokenEnd = 0;
    /** Characters treated as token delimiters. */
    private final static String PUNCTION = " -()/";
    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

    public My(Reader reader) {
        super(reader);
    }

    public My(AttributeFactory factory, Reader input) {
        super(factory, input);
    }

    /**
     * Reads the next character from the input, lower-casing ASCII 'A'-'Z'.
     *
     * @return the (possibly lower-cased) character code, or -1 at end of input
     * @throws IOException if the underlying reader fails
     */
    private int readLowerCased() throws IOException {
        int ci = input.read();
        if (ci >= 'A' && ci <= 'Z') { // ASCII upper-case letter -> lower-case
            ci += 'a' - 'A';
        }
        return ci;
    }

    @Override
    public boolean incrementToken() throws IOException {
        clearAttributes();
        buffer.setLength(0);
        tokenStart = tokenEnd;
        int ci = readLowerCased();
        char ch = (char) ci;
        while (true) {
            if (ci == -1) {
                // End of input: emit any buffered token, otherwise signal exhaustion.
                if (buffer.length() == 0) {
                    return false;
                }
                termAtt.setEmpty().append(buffer);
                offsetAtt.setOffset(correctOffset(tokenStart),
                        correctOffset(tokenEnd));
                return true;
            } else if (PUNCTION.indexOf(ch) != -1) {
                if (buffer.length() > 0) {
                    // Emit the buffered token BEFORE counting the delimiter:
                    // the end offset must exclude the delimiter character
                    // (the original code incremented tokenEnd first, producing
                    // an end offset one past the actual token).
                    termAtt.setEmpty().append(buffer);
                    offsetAtt.setOffset(correctOffset(tokenStart),
                            correctOffset(tokenEnd));
                    tokenEnd++; // account for the delimiter just consumed
                    return true;
                }
                // No token buffered yet: skip the delimiter and advance the
                // start offset with it. The original only advanced tokenEnd,
                // so tokens preceded by delimiters reported a start offset
                // that was too small — the indexing bug the author hit.
                tokenEnd++;
                tokenStart = tokenEnd;
                ci = readLowerCased();
                ch = (char) ci;
            } else {
                // Ordinary character: grow the current token.
                buffer.append(ch);
                tokenEnd++;
                ci = readLowerCased();
                ch = (char) ci;
            }
        }
    }

    @Override
    public void reset() throws IOException {
        super.reset();
        tokenStart = tokenEnd = 0;
    }

    @Override
    public void end() throws IOException {
        super.end();
        // Final offset: both start and end point just past the last char read.
        final int finalOffset = correctOffset(tokenEnd);
        offsetAtt.setOffset(finalOffset, finalOffset);
    }
}
然后开始写分词器
package TEST;
import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
/**
 * Analyzer whose analysis chain is just the {@link My} tokenizer,
 * with no additional token filters.
 */
public class MyAnalyzer extends Analyzer {
    @Override
    protected TokenStreamComponents createComponents(String fieldName,
            Reader reader) {
        final My tokenizer = new My(reader);
        return new TokenStreamComponents(tokenizer);
    }
}
最后测试下
package TEST;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
/**
 * Manual smoke test: runs a mixed Chinese/ASCII sample through
 * {@link MyAnalyzer} and prints each emitted term on its own line.
 */
public class TestMy {
    public static void main(String[] args) throws Exception {
        final String sample = "Norther 雪中悍刀行 河北邯郸 AC DF-II-SDFzd(asd)/小时";
        final MyAnalyzer analyzer = new MyAnalyzer();
        final TokenStream stream = analyzer.tokenStream("field", new StringReader(sample));
        final CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        // Standard TokenStream consumption contract: reset -> increment* -> end -> close.
        stream.reset();
        while (stream.incrementToken()) {
            System.out.println(term.toString());
        }
        stream.end();
        stream.close();
    }
}
测试结果: