由于 Lucene 3.2 发布不久,很难找到现成的用法示例,只能参考别人针对早期版本写的用法,再对照 API 文档来学习。
新版本中,早期版本的很多方法已经被废弃或移除,不能再用了。
学习之后终于写出了一份能用的代码。
首先是建索引:
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.Date;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
/**
 * Builds a Lucene index from every {@code .txt} file found (recursively) under
 * {@link #filedir} and stores it in {@link #indexdir}.
 *
 * <p>Each file becomes one document with two stored, analyzed fields:
 * {@code content} (the file text) and {@code name} (the canonical path).
 */
public class Main {
    /** Root directory that is scanned recursively for .txt files. */
    static File filedir = new File("e:\\work\\java\\crawl\\test");
    /** Directory the index is written to. */
    static File indexdir = new File("e:\\work\\java\\crawl\\index");
    /** Writer shared by the indexing helpers; assigned by {@link #writeToIndex()}. */
    static IndexWriter writer;

    /** Runs the indexing once and prints the elapsed wall-clock time in milliseconds. */
    public static void main(String[] args) {
        try {
            Date start = new Date();
            writeToIndex();
            Date end = new Date();
            System.out.println("建索引时间" + (end.getTime() - start.getTime())
                    + "毫秒");
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Indexes all .txt files under {@link #filedir}, optimizes the index and closes it.
     *
     * @return the number of documents in the index after indexing
     * @throws Exception if the index cannot be opened or written, or a file cannot be read
     */
    public static int writeToIndex() throws Exception {
        StandardAnalyzer aa = new StandardAnalyzer(Version.LUCENE_32);
        FSDirectory directory = FSDirectory.open(indexdir);
        try {
            writer = new IndexWriter(directory,
                    new IndexWriterConfig(Version.LUCENE_32, aa));
            try {
                indexDirectory(filedir);
                int numIndexed = writer.numDocs();
                System.out.println(numIndexed);
                writer.optimize();
                return numIndexed;
            } finally {
                // Always close the writer, even when indexing fails, so the
                // index is not left open by a writer that is no longer used.
                writer.close();
            }
        } finally {
            directory.close();
        }
    }

    /**
     * Recursively walks {@code dir} and indexes every file whose name ends with ".txt".
     *
     * @param dir directory to scan
     * @throws IOException if {@code dir} cannot be listed or a file cannot be indexed
     */
    private static void indexDirectory(File dir) throws IOException {
        File[] files = dir.listFiles();
        if (files == null) {
            // listFiles() returns null when dir is not a directory or cannot be
            // read; report that clearly instead of failing with a NullPointerException.
            throw new IOException("Cannot list directory: " + dir);
        }
        for (int i = 0; i < files.length; i++) {
            File f = files[i];
            if (f.isDirectory()) {
                indexDirectory(f);
            } else if (f.getName().endsWith(".txt")) {
                indexFile(f);
            }
        }
    }

    /**
     * Adds one file to the index as a document with "content" and "name" fields.
     * Hidden, missing or unreadable files are skipped silently.
     *
     * @param f file to index
     * @throws IOException if the file cannot be read or the document cannot be added
     */
    private static void indexFile(File f) throws IOException {
        if (f.isHidden() || !f.exists() || !f.canRead()) {
            return;
        }
        StringBuilder content = new StringBuilder();
        // NOTE(review): FileReader decodes with the platform default charset;
        // confirm that this matches the encoding of the crawled files.
        BufferedReader in = new BufferedReader(new FileReader(f));
        try {
            String line;
            while ((line = in.readLine()) != null) {
                // readLine() strips the line terminator; put a separator back so
                // the last word of one line is not fused with the first word of
                // the next one before the text is analyzed.
                content.append(line).append('\n');
            }
        } finally {
            // Close the reader on every path so file handles are not leaked.
            in.close();
        }
        Document doc = new Document();
        doc.add(new Field("content", content.toString(), Field.Store.YES,
                Field.Index.ANALYZED));
        doc.add(new Field("name", f.getCanonicalPath(), Field.Store.YES,
                Field.Index.ANALYZED));
        writer.addDocument(doc);
    }
}
然后说明下
Field.Store.YES:存储字段值(未分词前的字段值)
Field.Store.NO:不存储字段值;是否存储与是否建索引互不影响
Field.Store.COMPRESS:压缩存储,用于长文本或二进制,但性能受损(注意:该选项在 2.9 中已被废弃,并在 Lucene 3.0 中移除,3.2 中已不可用;需要压缩时可用 CompressionTools 手动压缩后作为二进制字段存储)
Field.Index.ANALYZED:分词建索引
Field.Index.ANALYZED_NO_NORMS:分词建索引,但不存储norms(norms保存索引时的boost和字段长度归一化信息,每个文档的每个字段占一个byte),这样可以节约存储空间和内存
Field.Index.NOT_ANALYZED:不分词,但建索引(整个字段值作为一个词条)
Field.Index.NOT_ANALYZED_NO_NORMS:不分词建索引,并且不存储norms
TermVector表示文档中某个字段(由一个Document和Field定位)的各个词条,以及它们在当前文档中所出现的次数
Field.TermVector.YES:为每个文档(Document)存储该字段的TermVector
Field.TermVector.NO:不存储TermVector
Field.TermVector.WITH_POSITIONS:存储位置
Field.TermVector.WITH_OFFSETS:存储偏移量
Field.TermVector.WITH_POSITIONS_OFFSETS:存储位置和偏移量
然后就是查询代码了
import java.io.File;
import java.util.Date;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
/**
 * Searches the index stored in {@link #indexdir} for {@link #KEYWORD} in the
 * "content" field and prints the stored content of the best {@link #TOP_NUM} hits.
 */
public class Main {
    /** Directory containing the index to search. */
    static File indexdir = new File("e:\\work\\java\\crawl\\index");
    /** Query string used by {@link #main(String[])}. */
    static String KEYWORD = "高新区";
    /** Maximum number of hits that are collected and printed. */
    static int TOP_NUM = 5;

    public static void main(String[] args) throws Exception {
        search(KEYWORD);
    }

    /**
     * Parses {@code q} with the query parser against the "content" field, runs the
     * search and prints the number of collected hits, the stored content of each
     * hit, and a summary line with the total hit count and the elapsed time.
     *
     * @param q query string in Lucene query-parser syntax
     * @throws Exception if the index cannot be opened or read, or {@code q} cannot be parsed
     */
    public static void search(String q) throws Exception {
        FSDirectory directory = FSDirectory.open(indexdir);
        try {
            // Second argument opens the underlying index reader read-only.
            IndexSearcher is = new IndexSearcher(directory, true);
            try {
                String field = "content";
                QueryParser parser = new QueryParser(Version.LUCENE_32, field,
                        new StandardAnalyzer(Version.LUCENE_32));
                Query query = parser.parse(q);
                TopScoreDocCollector collector = TopScoreDocCollector.create(TOP_NUM,
                        true);
                long start = new Date().getTime();
                is.search(query, collector);
                ScoreDoc[] hits = collector.topDocs().scoreDocs;
                System.out.println(hits.length);
                for (int i = 0; i < hits.length; i++) {
                    Document doc = is.doc(hits[i].doc);
                    System.out.println(doc.get("content"));
                }
                // The measured interval covers the search itself plus loading
                // and printing the hit documents.
                long end = new Date().getTime();
                System.out.println("Found " + collector.getTotalHits()
                        + " document(s) (in " + (end - start)
                        + " milliseconds) that matched query '" + q + "':");
            } finally {
                // Release the index reader held by the searcher on every path.
                is.close();
            }
        } finally {
            directory.close();
        }
    }
}
如果要用多个关键字分别查询多个字段,可以这样构造查询:
// Builds one combined query from several (query string, field) pairs.
// This fragment additionally needs imports for
// org.apache.lucene.search.BooleanClause and
// org.apache.lucene.queryParser.MultiFieldQueryParser, which the search
// example's import list does not contain.
// Per the MultiFieldQueryParser API, queries[i] is parsed against fields[i]
// and clauses[i] decides how that sub-query is combined (verify against the
// Lucene version in use); the three arrays must therefore have equal length.
String[] queries = { "高新区", "1000" };
// NOTE(review): the indexing example only adds "content" and "name" fields,
// so a "money" field must be added at index time for this query to match.
String[] fields = { "content", "money" };
// MUST on both clauses: a document has to match every sub-query.
BooleanClause.Occur[] clauses = { BooleanClause.Occur.MUST,
BooleanClause.Occur.MUST};
Query query = MultiFieldQueryParser.parse(Version.LUCENE_32, queries,
fields, clauses, new StandardAnalyzer(Version.LUCENE_32));
代码还是比较好懂的就不解释了。。。
瑞瑞好勤劳啊
看有没有头像
我去竟然有两个号。。。
不明真相,而且没有头像啊
那就去gravatar自己弄个头像就行了
弄了,当时没显出来……现在好了