Searching with Lucene 3.0

TermVector support was added in Lucene 1.4. A term vector stores, per document and field, the tokens produced at indexing time, and it can additionally store their positions (derived from Token.getPositionIncrement()) and their character offsets (Token.startOffset() and Token.endOffset()).
The available options (a small read-back sketch follows the list):
Field.TermVector.NO: do not store term vectors
Field.TermVector.YES: store term vectors
Field.TermVector.WITH_POSITIONS: store term vectors (terms plus token positions)
Field.TermVector.WITH_OFFSETS: store term vectors (terms plus token offsets)
Field.TermVector.WITH_POSITIONS_OFFSETS: store term vectors (terms plus token positions and offsets)
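A minimal read-back sketch (Lucene 3.0 API) of what WITH_POSITIONS_OFFSETS stores, using org.apache.lucene.index.TermPositionVector and TermVectorOffsetInfo. The helper name dumpSubjectVector is hypothetical, and reader/docId are assumed to come from an index like the one built below:

// Hypothetical helper: prints the tokens of one document's "subject" field
// together with their stored positions and character offsets.
static void dumpSubjectVector(IndexReader reader, int docId) throws IOException {
    // Only possible if the field was indexed with TermVector.WITH_POSITIONS_OFFSETS.
    TermPositionVector tpv =
            (TermPositionVector) reader.getTermFreqVector(docId, "subject");
    String[] terms = tpv.getTerms();          // distinct tokens of the field
    int[] freqs = tpv.getTermFrequencies();   // frequency of each token
    for (int i = 0; i < terms.length; i++) {
        int[] positions = tpv.getTermPositions(i);        // token positions
        TermVectorOffsetInfo[] offs = tpv.getOffsets(i);  // character offsets
        System.out.println(terms[i] + " freq=" + freqs[i]
                + " pos=" + positions[0]
                + " offset=" + offs[0].getStartOffset() + "-" + offs[0].getEndOffset());
    }
}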
 
The code for a Lucene "related articles" search:
import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.TermVector;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermFreqVector;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.store.RAMDirectory;
import org.wltea.analyzer.lucene.IKAnalyzer;

public class MoreLike {

    Analyzer analyzer = new IKAnalyzer();     // IK Analyzer for Chinese tokenization
    Directory ramDir = new RAMDirectory();    // in-memory index

    public void createRamIndex() throws CorruptIndexException,
            LockObtainFailedException, IOException {

        IndexWriter writer = new IndexWriter(ramDir, analyzer,
                IndexWriter.MaxFieldLength.LIMITED);

        Document doc1 = new Document();
        doc1.add(new Field("title", "wenhq", Field.Store.YES, Field.Index.ANALYZED));
        doc1.add(new Field("author", "callan", Field.Store.YES, Field.Index.ANALYZED));
        doc1.add(new Field("subject", "wenhq.com是亲亲宝宝网站的域名,记录软件开发的经验",
                Field.Store.YES, Field.Index.ANALYZED, TermVector.WITH_POSITIONS_OFFSETS));

        Document doc2 = new Document();
        doc2.add(new Field("title", "english", Field.Store.YES, Field.Index.ANALYZED));
        doc2.add(new Field("author", "wcq", Field.Store.YES, Field.Index.ANALYZED));
        doc2.add(new Field("subject", "学习english的人很多,亲亲宝宝网站的人也在学习",
                Field.Store.YES, Field.Index.ANALYZED, TermVector.WITH_POSITIONS_OFFSETS));

        Document doc3 = new Document();
        doc3.add(new Field("title", "asp", Field.Store.YES, Field.Index.ANALYZED));
        doc3.add(new Field("author", "ca", Field.Store.YES, Field.Index.ANALYZED));
        doc3.add(new Field("subject", "asp是一种网站开发语言",
                Field.Store.YES, Field.Index.ANALYZED, TermVector.WITH_POSITIONS_OFFSETS));

        writer.addDocument(doc1);
        writer.addDocument(doc2);
        writer.addDocument(doc3);

        writer.optimize();
        writer.close();
    }

    public void search() throws CorruptIndexException, IOException {
        IndexReader reader = IndexReader.open(ramDir);
        IndexSearcher searcher = new IndexSearcher(reader);

        Term term = new Term("title", "wenhq");   // look up the term "wenhq" in the title field
        TermQuery query = new TermQuery(term);

        TopScoreDocCollector collector = TopScoreDocCollector.create(10000, false);
        searcher.search(query, collector);
        ScoreDoc[] hits = collector.topDocs().scoreDocs;

        for (int i = 0; i < hits.length; i++) {
            Document doc = searcher.doc(hits[i].doc);
            System.out.println("search: ");
            System.out.println(doc.get("title") + "###" + doc.get("subject"));
            morelikeSearch(reader, hits[i].doc);   // find articles similar to this hit
        }
    }

    private void morelikeSearch(IndexReader reader, int id) throws IOException {
        System.out.println("moreLike search: ");

        // Fetch the term vector of the "subject" field for this document id:
        // the tokens produced when the field was analyzed, together with their
        // frequency, position and offset information.
        TermFreqVector vector = reader.getTermFreqVector(id, "subject");

        // OR together one TermQuery per stored token of the source document.
        BooleanQuery query = new BooleanQuery();
        for (int i = 0; i < vector.size(); i++) {
            TermQuery tq = new TermQuery(new Term("subject", vector.getTerms()[i]));
            query.add(tq, BooleanClause.Occur.SHOULD);
        }

        IndexSearcher searcher = new IndexSearcher(ramDir);
        TopScoreDocCollector collector = TopScoreDocCollector.create(10000, false);
        searcher.search(query, collector);
        ScoreDoc[] hits = collector.topDocs().scoreDocs;
        for (int i = 0; i < hits.length; i++) {
            Document doc = searcher.doc(hits[i].doc);
            System.out.println(doc.get("title") + "###" + doc.get("subject"));
        }
    }

    public static void main(String[] args) throws CorruptIndexException, IOException {
        MoreLike t = new MoreLike();
        t.createRamIndex();
        t.search();
    }
}
The output:

search:  (the article that was queried)
wenhq###wenhq.com是亲亲宝宝网站的域名,记录软件开发的经验
moreLike search:  (the related articles)
wenhq###wenhq.com是亲亲宝宝网站的域名,记录软件开发的经验
english###学习english的人很多,亲亲宝宝网站的人也在学习
asp###asp是一种网站开发语言

The queried document itself comes back as the top "related" hit, since the BooleanQuery contains every one of its own subject terms and therefore matches it best.
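For comparison, Lucene 3.0's contrib-queries module ships a ready-made helper, org.apache.lucene.search.similar.MoreLikeThis, that builds this kind of query from a document's term vectors automatically and selects the most significant terms by TF-IDF. A minimal sketch, assuming the reader and analyzer from the code above and a source document id docId; the threshold values are only illustrative:

MoreLikeThis mlt = new MoreLikeThis(reader);     // from the lucene-queries contrib jar
mlt.setAnalyzer(analyzer);                       // used when a field has no term vector
mlt.setFieldNames(new String[] { "subject" });   // fields to draw terms from
mlt.setMinTermFreq(1);                           // the defaults are tuned for long documents,
mlt.setMinDocFreq(1);                            // so relax them for these tiny samples
Query likeQuery = mlt.like(docId);               // build the query from an indexed document
TopDocs related = new IndexSearcher(reader).search(likeQuery, 10);

Compared with the hand-rolled BooleanQuery above, MoreLikeThis keeps only the highest-scoring terms of the source document, which usually gives a cleaner "related articles" ranking.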
