lucene通过修改boost值改善index索引

来源:互联网

并不是所有的Document和Field都是平等创建的。Document增量(boost)就是能够简单实现这种需求的一个特性。默认情况下,所有的Document都没有增量,也就是说它们都有相同的增量因数1.0。通过改变某个Document的增量因数,可以让Lucene认为它比索引中的其他Document更重要或更不重要。只需在建立索引时、把Document加入索引之前,对该Document调用setBoost(float)方法即可。

看两个示例,就能明白其中的用法。

示例1,使用默认的boost

[java] view plaincopy
  1. package com.cn;
  2. import org.apache.lucene.analysis.standard.StandardAnalyzer;
  3. import org.apache.lucene.document.Document;
  4. import org.apache.lucene.document.Field;
  5. import org.apache.lucene.index.IndexWriter;
  6. import org.apache.lucene.index.IndexWriterConfig;
  7. import org.apache.lucene.index.Term;
  8. import org.apache.lucene.search.IndexSearcher;
  9. import org.apache.lucene.search.Query;
  10. import org.apache.lucene.search.ScoreDoc;
  11. import org.apache.lucene.search.TermQuery;
  12. import org.apache.lucene.search.TopDocs;
  13. import org.apache.lucene.store.Directory;
  14. import org.apache.lucene.store.RAMDirectory;
  15. import org.apache.lucene.util.Version;
  16. public class TT {
  17. public static void main(String []args) throws Exception {
  18. String [] ids = {"1","2","3","4","5"};
  19. String [] province = {"shanghai","beijing","liaoning","liaoning","zhejiang"};
  20. String [] contents = {"shanghai is a city","beijing is a city","jinzhou is a city","shenyang is a city","hangzhou is a city"};
  21. String [] city = {"shanghai","beijing","jinzhou","shenyang","hangzhou"};
  22. Directory directory = new RAMDirectory();
  23. IndexWriter indexWriter = new IndexWriter(directory,new IndexWriterConfig(Version.LUCENE_34, new StandardAnalyzer(Version.LUCENE_34)));
  24. for(int i = 0;i < ids.length;i++){
  25. Document doc = new Document();
  26. doc.add(new Field("id",ids[i],Field.Store.YES,Field.Index.NOT_ANALYZED));
  27. doc.add(new Field("province",province[i],Field.Store.YES,Field.Index.ANALYZED));
  28. doc.add(new Field("contents",contents[i],Field.Store.YES,Field.Index.ANALYZED));
  29. doc.add(new Field("city",city[i],Field.Store.YES,Field.Index.ANALYZED));
  30. indexWriter.addDocument(doc);
  31. }
  32. System.out.println("total:"+indexWriter.numDocs());
  33. indexWriter.close();
  34. queryMethod(directory,"contents","city");
  35. }
  36. public static void queryMethod(Directory directory,String item,String txt)throws Exception {
  37. Term term = new Term(item,txt);
  38. Query query = new TermQuery(term);
  39. IndexSearcher indexSearcher = new IndexSearcher(directory);
  40. TopDocs topDocs = indexSearcher.search(query, 10);
  41. System.out.println("it has "+topDocs.totalHits+" "+txt+" in "+item);
  42. ScoreDoc [] scoreDoc = topDocs.scoreDocs;
  43. for(int i=0;i<scoreDoc.length;i++){
  44. Document d = indexSearcher.doc(scoreDoc[i].doc);
  45. System.out.println("city:"+d.get("city"));
  46. System.out.println("contents:"+d.get("contents"));
  47. }
  48. }
  49. }

运行结果为:

total:5
it has 5 city in contents
city:shanghai
contents:shanghai is a city
city:beijing
contents:beijing is a city
city:jinzhou
contents:jinzhou is a city
city:shenyang
contents:shenyang is a city
city:hangzhou
contents:hangzhou is a city

 

示例2,增加了boost,代码为

[java] view plaincopy
  1. package com.cn;
  2. import org.apache.lucene.analysis.standard.StandardAnalyzer;
  3. import org.apache.lucene.document.Document;
  4. import org.apache.lucene.document.Field;
  5. import org.apache.lucene.index.IndexWriter;
  6. import org.apache.lucene.index.IndexWriterConfig;
  7. import org.apache.lucene.index.Term;
  8. import org.apache.lucene.search.IndexSearcher;
  9. import org.apache.lucene.search.Query;
  10. import org.apache.lucene.search.ScoreDoc;
  11. import org.apache.lucene.search.TermQuery;
  12. import org.apache.lucene.search.TopDocs;
  13. import org.apache.lucene.store.Directory;
  14. import org.apache.lucene.store.RAMDirectory;
  15. import org.apache.lucene.util.Version;
  16. public class TT {
  17. public static void main(String []args) throws Exception {
  18. String [] ids = {"1","2","3","4","5"};
  19. String [] province = {"shanghai","beijing","liaoning","liaoning","zhejiang"};
  20. String [] contents = {"shanghai is a city","beijing is a city","jinzhou is a city","shenyang is a city","hangzhou is a city"};
  21. String [] city = {"shanghai","beijing","jinzhou","shenyang","hangzhou"};
  22. Directory directory = new RAMDirectory();
  23. IndexWriter indexWriter = new IndexWriter(directory,new IndexWriterConfig(Version.LUCENE_34, new StandardAnalyzer(Version.LUCENE_34)));
  24. for(int i = 0;i < ids.length;i++){
  25. Document doc = new Document();
  26. doc.add(new Field("id",ids[i],Field.Store.YES,Field.Index.NOT_ANALYZED));
  27. doc.add(new Field("province",province[i],Field.Store.YES,Field.Index.ANALYZED));
  28. doc.add(new Field("contents",contents[i],Field.Store.YES,Field.Index.ANALYZED));
  29. doc.add(new Field("city",city[i],Field.Store.YES,Field.Index.ANALYZED));
  30. if(province[i].equals("liaoning")){
  31. if(city[i].equals("shenyang")){
  32. doc.setBoost(5.0f);
  33. }else{
  34. doc.setBoost(2.0f);
  35. }
  36. }
  37. indexWriter.addDocument(doc);
  38. }
  39. System.out.println("total:"+indexWriter.numDocs());
  40. indexWriter.close();
  41. queryMethod(directory,"contents","city");
  42. }
  43. public static void queryMethod(Directory directory,String item,String txt)throws Exception {
  44. Term term = new Term(item,txt);
  45. Query query = new TermQuery(term);
  46. IndexSearcher indexSearcher = new IndexSearcher(directory);
  47. TopDocs topDocs = indexSearcher.search(query, 10);
  48. System.out.println("it has "+topDocs.totalHits+" "+txt+" in "+item);
  49. ScoreDoc [] scoreDoc = topDocs.scoreDocs;
  50. for(int i=0;i<scoreDoc.length;i++){
  51. Document d = indexSearcher.doc(scoreDoc[i].doc);
  52. System.out.println("city:"+d.get("city"));
  53. System.out.println("contents:"+d.get("contents"));
  54. }
  55. }
  56. }

运行结果为:

total:5
it has 5 city in contents
city:shenyang
contents:shenyang is a city
city:jinzhou
contents:jinzhou is a city
city:shanghai
contents:shanghai is a city
city:beijing
contents:beijing is a city
city:hangzhou
contents:hangzhou is a city

对比两个例子的运行结果可以看出:设置boost之后,属于辽宁省(liaoning)的结果排到了靠前的位置,而且boost为5.0的shenyang比boost为2.0的jinzhou还要靠前。

发表评论