lucene3.2 简单的建立索引和查询

由于lucene3.2发布不久,想找个用法示例都比较困难,只好参考其他人以前针对早期版本写的用法,再对照API文档来学习。
在新版本中,以前版本的很多方法都不能用了。


学习之后终于写出了个能用的代码
首先是建索引:

  1. import java.io.BufferedReader;
  2. import java.io.File;
  3. import java.io.FileReader;
  4. import java.io.IOException;
  5. import java.util.Date;
  6.  
  7. import org.apache.lucene.analysis.standard.StandardAnalyzer;
  8. import org.apache.lucene.document.Document;
  9. import org.apache.lucene.document.Field;
  10. import org.apache.lucene.index.IndexWriter;
  11. import org.apache.lucene.index.IndexWriterConfig;
  12. import org.apache.lucene.store.FSDirectory;
  13. import org.apache.lucene.util.Version;
  14.  
  15. /**
  16.  * Builds a Lucene 3.2 index over every ".txt" file found under {@link #filedir}
  17.  * and writes it to {@link #indexdir}.
  18.  */
  19. public class Main {
  20.  static File filedir = new File("e:\\work\\java\\crawl\\test");
  21.  static File indexdir = new File("e:\\work\\java\\crawl\\index");
  22.  static IndexWriter writer;
  23.  
  24.  /** Runs one indexing pass and prints the elapsed time in milliseconds. */
  25.  public static void main(String[] args) {
  26.   try {
  27.    Date start = new Date();
  28.    writeToIndex();
  29.    Date end = new Date();
  30.    System.out.println("建索引时间" + (end.getTime() - start.getTime())
  31.      + "毫秒");
  32.   } catch (Exception e) {
  33.    e.printStackTrace();
  34.   }
  35.  }
  36.  
  37.  /**
  38.   * Opens the shared writer on {@link #indexdir}, indexes {@link #filedir}
  39.   * recursively, optimizes the index and closes the writer.
  40.   *
  41.   * @return the writer's document count after indexing
  42.   * @throws Exception if the index cannot be opened or written
  43.   */
  44.  public static int writeToIndex() throws Exception {
  45.   StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_32);
  46.   writer = new IndexWriter(FSDirectory.open(indexdir),
  47.     new IndexWriterConfig(Version.LUCENE_32, analyzer));
  48.   try {
  49.    indexDirectory(filedir);
  50.    int numIndexed = writer.numDocs();
  51.    System.out.println(numIndexed);
  52.    writer.optimize();
  53.    return numIndexed;
  54.   } finally {
  55.    // Close even when indexing fails part-way, so the writer is never left open.
  56.    writer.close();
  57.   }
  58.  }
  59.  
  60.  /**
  61.   * Walks dir recursively and indexes every file whose name ends with ".txt".
  62.   *
  63.   * @param dir directory to scan
  64.   * @throws IOException if a file cannot be read or added to the index
  65.   */
  66.  private static void indexDirectory(File dir) throws IOException {
  67.   File[] files = dir.listFiles();
  68.   if (files == null) {
  69.    // listFiles() returns null when dir is not a directory or cannot be listed.
  70.    return;
  71.   }
  72.   for (File f : files) {
  73.    if (f.isDirectory()) {
  74.     indexDirectory(f);
  75.    } else if (f.getName().endsWith(".txt")) {
  76.     indexFile(f);
  77.    }
  78.   }
  79.  }
  80.  
  81.  /**
  82.   * Adds one document for f with two stored, analyzed fields: "content" (the
  83.   * file's text) and "name" (its canonical path). Hidden, missing or unreadable
  84.   * files are skipped.
  85.   *
  86.   * @param f file to index
  87.   * @throws IOException if the file cannot be read or the document cannot be added
  88.   */
  89.  private static void indexFile(File f) throws IOException {
  90.   if (f.isHidden() || !f.exists() || !f.canRead()) {
  91.    return;
  92.   }
  93.   StringBuilder sb = new StringBuilder();
  94.   // NOTE(review): FileReader decodes with the platform default charset; confirm
  95.   // that it matches the encoding of the .txt files being indexed.
  96.   BufferedReader in = new BufferedReader(new FileReader(f));
  97.   try {
  98.    String line;
  99.    while ((line = in.readLine()) != null) {
  100.     // readLine() strips the line terminator; put one back so the last word of
  101.     // a line is not fused with the first word of the next line.
  102.     sb.append(line).append('\n');
  103.    }
  104.   } finally {
  105.    // Release the file handle even if reading fails.
  106.    in.close();
  107.   }
  108.   Document doc = new Document();
  109.   doc.add(new Field("content", sb.toString(), Field.Store.YES,
  110.     Field.Index.ANALYZED));
  111.   doc.add(new Field("name", f.getCanonicalPath(), Field.Store.YES,
  112.     Field.Index.ANALYZED));
  113.   writer.addDocument(doc);
  114.  }
  115. }

然后说明下
Field.Store.YES:存储字段值(未分词前的字段值)
Field.Store.NO:不存储,存储与索引没有关系
Field.Store.COMPRESS:压缩存储,用于长文本或二进制,但性能受损(注意:该选项自Lucene 3.0起已被移除,3.2中不可用;需要压缩时可用CompressionTools自行压缩后以二进制字段存储)
Field.Index.ANALYZED:分词建索引
Field.Index.ANALYZED_NO_NORMS:分词建索引,但不存储norms(norms记录索引时的boost和字段长度归一化信息,每个文档的每个被索引字段占一个byte),这样节约存储空间,代价是该字段的索引时boost和长度归一化失效
Field.Index.NOT_ANALYZED:不分词且索引
Field.Index.NOT_ANALYZED_NO_NORMS:不分词建索引,且不存储norms(每个文档的该字段可节约一个byte)
TermVector表示文档的条目(由一个Document和Field定位)和它们在当前文档中所出现的次数
Field.TermVector.YES:为每个文档(Document)存储该字段的TermVector
Field.TermVector.NO:不存储TermVector
Field.TermVector.WITH_POSITIONS:存储位置
Field.TermVector.WITH_OFFSETS:存储偏移量
Field.TermVector.WITH_POSITIONS_OFFSETS:存储位置和偏移量

然后就是查询代码了

  1. import java.io.File;
  2. import java.util.Date;
  3.  
  4. import org.apache.lucene.analysis.standard.StandardAnalyzer;
  5. import org.apache.lucene.document.Document;
  6. import org.apache.lucene.queryParser.QueryParser;
  7. import org.apache.lucene.search.IndexSearcher;
  8. import org.apache.lucene.search.Query;
  9. import org.apache.lucene.search.ScoreDoc;
  10. import org.apache.lucene.search.TopScoreDocCollector;
  11. import org.apache.lucene.store.FSDirectory;
  12. import org.apache.lucene.util.Version;
  13.  
  14. /**
  15.  * Searches the "content" field of the index in {@link #indexdir} for
  16.  * {@link #KEYWORD} and prints the stored content of the top hits.
  17.  */
  18. public class Main {
  19.  static File indexdir = new File("e:\\work\\java\\crawl\\index");
  20.  static String KEYWORD = "高新区";
  21.  static int TOP_NUM = 5;
  22.  
  23.  /** Runs a single search for {@link #KEYWORD}. */
  24.  public static void main(String[] args) throws Exception {
  25.   search(KEYWORD);
  26.  }
  27.  
  28.  /**
  29.   * Parses q against the "content" field, collects at most {@link #TOP_NUM}
  30.   * hits, prints the number of collected hits, each hit's stored "content",
  31.   * and finally a summary line with the total hit count and elapsed time.
  32.   *
  33.   * @param q query text in QueryParser syntax
  34.   * @throws Exception if the index cannot be opened or the query cannot be parsed
  35.   */
  36.  public static void search(String q) throws Exception {
  37.   IndexSearcher is = new IndexSearcher(FSDirectory.open(indexdir), true);
  38.   try {
  39.    String field = "content";
  40.    QueryParser parser = new QueryParser(Version.LUCENE_32, field,
  41.      new StandardAnalyzer(Version.LUCENE_32));
  42.    Query query = parser.parse(q);
  43.    TopScoreDocCollector collector = TopScoreDocCollector.create(TOP_NUM,
  44.      true);
  45.    long start = new Date().getTime();
  46.    is.search(query, collector);
  47.    ScoreDoc[] hits = collector.topDocs().scoreDocs;
  48.    System.out.println(hits.length);
  49.    for (int i = 0; i < hits.length; i++) {
  50.     Document doc = is.doc(hits[i].doc);
  51.     System.out.println(doc.get("content"));
  52.    }
  53.    // The measured interval covers both the search and the printing of the hits.
  54.    long end = new Date().getTime();
  55.    System.out.println("Found " + collector.getTotalHits()
  56.      + " document(s) (in " + (end - start)
  57.      + " milliseconds) that matched query '" + q + "':");
  58.   } finally {
  59.    // Close the searcher even when parsing or searching fails, so the index
  60.    // files it opened are released.
  61.    is.close();
  62.   }
  63.  }
  64. }

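如果是在多个字段上用多个关键字查询就这样(需要额外导入 org.apache.lucene.search.BooleanClause 和 org.apache.lucene.queryParser.MultiFieldQueryParser;注意 fields 中的字段必须已经建立索引,上面的建索引示例只有 content 和 name 两个字段,这里的 money 仅作演示):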

  1. String[] queries = { "高新区", "1000" }; // queries[i] is parsed against fields[i]
  2. String[] fields = { "content", "money" }; // NOTE(review): the indexing listing only adds "content" and "name"; confirm "money" is indexed
  3. BooleanClause.Occur[] clauses = { BooleanClause.Occur.MUST, // one flag per field
  4.     BooleanClause.Occur.MUST}; // MUST on both: a document has to match every field's query
  5. Query query = MultiFieldQueryParser.parse(Version.LUCENE_32, queries, // builds one combined query from the per-field queries
  6. fields, clauses, new StandardAnalyzer(Version.LUCENE_32));

代码还是比较好懂的就不解释了。。。

您可能喜欢:
我猜您可能还喜欢:
, ,

《lucene3.2 简单的建立索引和查询》有 8 条评论

  1. orbbyrp | #2

    看有没有头像

Trackbacks/Pingbacks:

  1. python中文分词 | 吃杂烩
  2. 写python的c扩展简介 | 吃杂烩

发表评论