Grouping Lucene search results, similar to SQL GROUP BY

Lucene's contrib DuplicateFilter keeps only one document per distinct value of a given field, so a search run through it returns exactly one representative hit per group; running a TermQuery on each representative's group value then gives that group's size. The example below indexes five documents whose "duplicate" field acts as the grouping column.
import java.io.IOException;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriter.MaxFieldLength;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.DuplicateFilter;
import org.apache.lucene.search.Filter;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;

public class DuplicateFilterTest {

    public static void main(String[] args) {
        Directory dir = new RAMDirectory();
        try {
            // Index five documents; "duplicate" is the grouping key (the GROUP BY column).
            IndexWriter writer = new IndexWriter(dir,
                    new StandardAnalyzer(Version.LUCENE_29), true, MaxFieldLength.LIMITED);
            writer.addDocument(makeDoc("binbin", "123456"));
            writer.addDocument(makeDoc("yaoyao", "123456"));
            writer.addDocument(makeDoc("liufeng", "123456"));
            writer.addDocument(makeDoc("zhangjian", "123455"));
            writer.addDocument(makeDoc("liweicheng", "123451"));
            writer.close();

            // DuplicateFilter drops all but one document per distinct "duplicate" value,
            // so the search returns exactly one representative hit per group.
            Query query = new TermQuery(new Term("string", "haha"));
            Filter filter = new DuplicateFilter("duplicate");
            IndexSearcher searcher = new IndexSearcher(dir);
            TopDocs top = searcher.search(query, filter, 200);
            for (ScoreDoc scoreDoc : top.scoreDocs) {
                Document rdoc = searcher.doc(scoreDoc.doc);
                System.out.print("id:" + rdoc.get("id") + " group:" + rdoc.get("duplicate"));
                // The group size is the hit count of a TermQuery on the grouping value.
                Query groupQuery = new TermQuery(new Term("duplicate", rdoc.get("duplicate")));
                System.out.println(" count:" + searcher.search(groupQuery, 100).totalHits);
            }
            searcher.close();
        } catch (IOException e) {
            // CorruptIndexException and LockObtainFailedException are IOException subclasses,
            // so one catch block covers all three cases of the original code.
            e.printStackTrace();
        }
    }

    // All documents share the same "string" and "time" values; only "id" and
    // the grouping key "duplicate" vary.
    private static Document makeDoc(String id, String duplicate) {
        Document doc = new Document();
        doc.add(new Field("id", id, Store.YES, Index.NOT_ANALYZED));
        doc.add(new Field("string", "haha", Store.YES, Index.NOT_ANALYZED));
        doc.add(new Field("time", "20100801", Store.YES, Index.NOT_ANALYZED));
        doc.add(new Field("duplicate", duplicate, Store.YES, Index.NOT_ANALYZED));
        return doc;
    }
}
Output, one line per distinct value of "duplicate" (the filter kept the last document of each group, which is why liufeng rather than binbin represents group 123456):

id:liufeng group:123456 count:3
id:zhangjian group:123455 count:1
id:liweicheng group:123451 count:1
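A possible refinement, not part of the original post: the loop above issues one extra search per group just to get the count. Because "duplicate" is indexed NOT_ANALYZED, the same number can be read straight from the term dictionary with IndexReader.docFreq. A minimal sketch (add import org.apache.lucene.index.IndexReader; it assumes the index has no deleted documents, since docFreq does not skip them):

            // Sketch: group sizes from the term dictionary, no per-group search.
            IndexReader reader = IndexReader.open(dir);
            for (ScoreDoc scoreDoc : top.scoreDocs) {
                Document rdoc = searcher.doc(scoreDoc.doc);
                String group = rdoc.get("duplicate");
                System.out.println("id:" + rdoc.get("id") + " group:" + group
                        + " count:" + reader.docFreq(new Term("duplicate", group)));
            }
            reader.close();

Separately, if the first document of each group should be kept instead of the last, the filter's keep mode can be switched with filter.setKeepMode(DuplicateFilter.KM_USE_FIRST_OCCURRENCE) (constant name as in the 2.9-era contrib DuplicateFilter).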