lucene 查询示例
排序 Lucene默认按照相关度(score)排序,为了能支持其他的排序方式,比如日期,我们在add Field的时候,必须保证field被Index且不能被tokenized(分词),并且排序的只能是数字,日期,字符三种类型之一
实体类
public class Article {
privateStringid;
privateStringtitle;
privateStringkeyWords;
privateStringcontent;
privateintorder;
省略set..get方法
}组织数据
import java.util.ArrayList;
import java.util.List;
import com.company.project.entity.Article;
/**
 * Static sample data used by the indexing and search demos.
 *
 * <p>Articles are deliberately added out of id order (1, 3, 2, 4) so the
 * sorting examples have something visible to reorder.
 */
public class DATAUTIls {
    public static List<Article> luceneDatas = new ArrayList<Article>();

    static {
        Article first = build("1", "法眼看中国是怎么样的一个中国", "中国,中国,中国", "我们都是中国人", 1);
        Article second = build("2", "法眼看中国是怎么样的一个中国 中国", "中国,中国", "我们是两个中国 中国", 2);
        Article third = build("3", "法眼看怎么样的一个中国", "中国 ", "我们都是中国人", 3);
        Article fourth = build("4", "法眼看", "无", "我们都是国中人", 4);
        luceneDatas.add(first);
        luceneDatas.add(third);
        luceneDatas.add(second);
        luceneDatas.add(fourth);
    }

    // Small factory to keep the static initializer readable.
    private static Article build(String id, String title, String keyWords, String content, int order) {
        Article article = new Article();
        article.setId(id);
        article.setTitle(title);
        article.setKeyWords(keyWords);
        article.setContent(content);
        article.setOrder(order);
        return article;
    }
}
创建索引
import java.io.File;
import java.io.IOException;
import java.util.List;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.NRTCachingDirectory;
import org.apache.lucene.util.Version;
import com.company.project.entity.Article;
public class IndexRunner {
private String INDEX_STORe_PATH = "D:\\workplace\\company\\mylucene\\indexstore";
public IndexRunner(){};
public IndexRunner(String index_path)
{
this.INDEX_STORe_PATH = index_path;
File dir = new File(index_path);
if(dir.exists())
{
dir.mkdir();
}
}
//创建索引
public void createIndex(List<Article> datas,boolean isCreate) throws IOException
{
//待创建得文档目录
Directory dir = FSDirectory.open(new File(INDEX_STORe_PATH));
//选择得分词工具
Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_34);
//建立索引的配置类,包含了一个解析器
IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_34, analyzer);
//设置我们的解析器是新建还是追加更新
if(isCreate){
iwc.setOpenMode(OpenMode.CREATE);//每次建立都覆盖原来的索引
}
else{
iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);//每次都追加更新
}
NRTCachingDirectory cachedFSDir = new NRTCachingDirectory(dir, 5.0, 60.0);
iwc.setMergeScheduler(cachedFSDir.getMergeScheduler());
//索引的建立类 第一个参数索引的存放位置,第二个参数索引的配置对象
IndexWriter writer = new IndexWriter(dir, iwc);
for(int i=0;i<datas.size();i++)
{
Article article = datas.get(i );
/*
* Field.Store.YES:存储字段值(未分词前的字段值) Field.Store.NO:不存储,存储与索引没有关系
* Field.Store.COMPRESS:压缩存储,用于长文本或二进制,但性能受损 Field.Index.ANALYZED:分词建索引
* Field.Index.ANALYZED_NO_NORMS:分词建索引,但是Field的值不像通常那样被保存,而是只取一个byte,这样节约存储空间
* Field.Index.NOT_ANALYZED:不分词且索引
* Field.Index.NOT_ANALYZED_NO_NORMS:不分词建索引,Field的值去一个byte保存
*/
Field f0 = new Field("title", article.getTitle(), Field.Store.YES, Field.Index.ANALYZED);
Field f1 = new Field("content",article.getContent(),Field.Store.YES,Field.Index.ANALYZED);
Field f2 = new Field("order",String.valueOf(article.getOrder()),Field.Store.YES,Field.Index.NOT_ANALYZED);
Field f3 = new Field("id",String.valueOf(article.getId()),Field.Store.YES,Field.Index.NOT_ANALYZED);
Document doc = new Document();
doc.add(f0) ;
doc.add(f1);
doc.add(f2);
doc.add(f3);
writer.addDocument(doc);
}
//这个方法在新增索引的情况会很有用,就是讲原来散落的索引文件重新进行整理合并!
//
writer.forceMerge(1);
writer.close();
System.out.println("索引创建成功");
}
public static void main(String[] args) {
IndexRunner indexRunner = new IndexRunner();
try {
indexRunner.createIndex(DATAUTIls.luceneDatas,true);
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
}查询
此处有三种查询,一种是多字段查询一个关键字, 一种是多字段组合查询,还有一种是分页查询
import java.io.File;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Filter;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import com.company.project.entity.Article;
public class SearchRunner {
private static String PATH = "D:\\workplace\\company\\mylucene\\indexstore";
public static void main(String [] arg) throws Exception{
String[] queryFileds = { "title", "content" };
String queryString = "中国";
//SearchRunner.searchList(queryFileds, queryString );
// SearchRunner.combinationSearch();
SearchRunner.pagingSearch("中", null);
}
/**
* 在多个字段查找同一个值
*/
public static void searchList(String[] queryFileds,String queryString) throws Exception
{
// 查询的字符串:输入不存在的字符串是查询不到的,如:中国
IndexReader reader = IndexReader.open(FSDirectory.open(new File(PATH)));
IndexSearcher searcher = new IndexSearcher(reader);
Query query = LuceneUtils.createQuery(queryFileds, queryString);
// 在搜索器中进行查询
// 对查询内容进行过滤
Filter filter = null;
// 一次在索引器查询多少条数据
int queryCount = 100;
TopDocs results = searcher.search(query, filter, queryCount);
System.out.println("总符合: " + results.totalHits + "条数!");
// 显示记录
for (ScoreDoc sr : results.scoreDocs)
{
// 文档编号
int docID = sr.doc;
// 真正的内容
Document doc = searcher.doc(docID);
System.out.println("inof = " + doc.get("title"));
System.out.println("info2 = " + doc.get("content"));
}
}
public static void combinationSearch() throws CorruptIndexException, IOException, ParseException{
IndexReader reader = IndexReader.open(FSDirectory.open(new File(PATH)));
IndexSearcher searcher = new IndexSearcher(reader);
//选择得分词工具
Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_34);
QueryParser parser = new QueryParser(Version.LUCENE_34, "content",
analyzer);
//注意此处AND一定要大写
Query query = parser.parse("content:中 国 AND title:中 国");
// 一次在索引器查询多少条数据
int queryCount = 100;
// Sort sort = new Sort(new SortField("order",SortField.DOUBLE,false)); //排序 false 升序 true降序
//TopDocs results = searcher.search(query, queryCount,sort);
TopDocs results = searcher.search(query, queryCount);
System.out.println("总符合: " + results.totalHits + "条数!");
// 显示记录
for (ScoreDoc sr : results.scoreDocs)
{
//Sort(field,true)
// 文档编号
int docID = sr.doc;
// 真正的内容
Document doc = searcher.doc(docID);
System.out.println("id="+doc.get("id")+"\torder="+doc.get("order")+"\ttitle = " + doc.get("title")+"\tcontent = " + doc.get("content"));
}
}
// 分页查询
public static Map pagingSearch(String title,String content) throws CorruptIndexException, IOException, ParseException, InvalidTokenOffsetsException{
Map pager = new HashMap();
List<Article> blogList=new ArrayList<Article>() ;
TokenStream tokenStream=null;
Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_34);
//获取IndexSearcher 对象
IndexReader reader = IndexReader.open(FSDirectory.open(new File(PATH)));
IndexSearcher indexSearch = new IndexSearcher(reader);
QueryParser queryParser = new QueryParser(Version.LUCENE_34, "content", analyzer);
//搜索条件的结合
String str="";
if(title!=null &&title.length()>0){
str="title:"+title;
}
if(content!=null &&content.length()>0){
if(str.trim().length()>0)
{
str +=" AND";
}
str="content:"+content;
}
//设置搜索条件
Query query=queryParser.parse(str);
//查询搜索引擎
TopDocs result = indexSearch.search(query, 10);
//上一页的最后一个document索引 第一页为0,其余也该页的起始记录条数
int index=2;
ScoreDoc scoreDoc=null;
//如果当前页是第一页面scoreDoc=null。
if(index>0){
//因为索引是从0开始所以要index-1
scoreDoc=result.scoreDocs[index-1];
}
//分页处理
int pageSize = 2;
TopDocs hits= indexSearch.searchAfter(scoreDoc, query, pageSize);
//设置分页的总记录数
//循环hits.scoreDocs数据,并使用indexSearch.doc方法把Document还原,再拿出对应的字段的值
for (int i = 0; i < hits.scoreDocs.length; i++) {
ScoreDoc sdoc = hits.scoreDocs[i];
Document doc = indexSearch.doc(sdoc.doc);
Article article = new Article();
String stitle = doc.get("title");
String scontent = doc.get("content");
String id = doc.get("id");
//加亮处理
SimpleHTMLFormatter simplehtml=new SimpleHTMLFormatter("<font color='red'>", "</font>");
Highlighter highlighter = new Highlighter(simplehtml,new QueryScorer(query));
if(title!=null){
tokenStream = analyzer.tokenStream("title",new StringReader(title));
String highLightText = highlighter.getBestFragment(tokenStream, title);
article.setTitle(highLightText==null?title:highLightText);
}else
{
article.setTitle(stitle);
}
if(content!=null){
tokenStream = analyzer.tokenStream("content",new StringReader(content));
String highLightText = highlighter.getBestFragment(tokenStream, content);
article.setContent(highLightText==null?title:highLightText);
}else
{
article.setContent(scontent);
}
article.setId(id);
System.out.println(article);
blogList.add(article);
}
pager.put("content",hits.totalHits);
pager.put("data",blogList);
return pager;
}
} 相关推荐
renjinlong 2020-09-03
Jacry 2020-07-04
IceStreamLab 2020-06-26
mengyue 2020-06-09
PasserbyX 2020-05-16
mameng 2020-05-12
心丨悦 2020-05-06
编码之路 2020-05-03
mengyue 2020-05-02
qiuzhuoxian 2020-02-23
编码之路 2020-02-20
lionelf 2020-02-03
TyCoding 2020-02-01
heniancheng 2020-01-31
某某某 2020-01-30
PinkBean 2020-01-29
某某某 2020-01-12
编码之路 2020-01-01
itmale 2020-01-01