Lucene 3.0.1 全文检索引擎的架构 对文件,数据库建索引,及查询(高亮显示)

Lucene 3.0.1 全文检索引擎的架构 对文件,数据库建索引,及查询(高亮显示)
lucene是apache软件基金会jakarta项目组的一个子项目,是一个开放源代码的全文检索引擎工具包,即它不是一个完整的全文检索引擎,而是一个全文检索引擎的架构,提供了完整的查询引擎和索引引擎,部分文本分析引擎(英文与德文两种西方语言)。Lucene的目的是为软件开发人员提供一个简单易用的工具包,以方便的在目标系统中实现全文检索的功能,或者是以此为基础建立起完整的全文检索引擎。
查询 关键词 “唐山” 之后效果图:Lucene 3.0.1 全文检索引擎的架构 对文件,数据库建索引,及查询(高亮显示)

对文件创建索引及查询
创建索引 Lucene 3.0(第一步)
package com.gjw.lecence;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.util.Date;

import jxl.Sheet;
import jxl.Workbook;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.SimpleFSDirectory;
import org.apache.lucene.util.Version;
import org.textmining.text.extraction.WordExtractor;

/**
 * 创建索引 Lucene 3.0(第一步)
 * 
 * @author RenWeigang
 * 
 * @version 2010.12.13
 * 
 */
public class Indexer {

    // Directory where the Lucene index files are written.
    private static String INDEX_DIR = "E:\\index";
    // Directory whose .txt/.doc/.xls files are indexed.
    private static String DATA_DIR = "E:\\rr";

    public static void main(String[] args) throws Exception {
        long start = new Date().getTime();
        int numIndexed = index(new File(INDEX_DIR), new File(DATA_DIR));
        long end = new Date().getTime();
        System.out.println("Indexing " + numIndexed + " files took "
                + (end - start) + " milliseconds");
    }

    /**
     * Indexes the files under dataDir and stores the index in indexDir.
     *
     * @param indexDir directory where the index is written
     * @param dataDir  directory containing the files to index
     * @return number of documents added to the index
     * @throws IOException if dataDir does not exist or is not a directory
     */
    public static int index(File indexDir, File dataDir) throws IOException {
        if (!dataDir.exists() || !dataDir.isDirectory()) {
            throw new IOException(dataDir
                    + " does not exist or is not a directory");
        }
        /*
         * IndexWriter arguments:
         *  1) the Directory holding the index files,
         *  2) the analyzer used for tokenizing field text,
         *  3) create flag: true recreates the index (deleting an existing one),
         *     false appends to an existing index and throws if none exists,
         *  4) the per-field token cap; IndexWriter.MaxFieldLength.LIMITED
         *     uses the library's default limit.
         */
        Directory dir = new SimpleFSDirectory(indexDir);
        IndexWriter writer = new IndexWriter(dir,
                new StandardAnalyzer(Version.LUCENE_30), true,
                IndexWriter.MaxFieldLength.LIMITED);
        try {
            indexDirectory(writer, dataDir);
            int numIndexed = writer.numDocs();
            writer.optimize(); // merge segments so searches open fewer files
            writer.commit();
            return numIndexed;
        } finally {
            writer.close(); // always release the index lock, even on error
        }
    }

    /**
     * Recursively walks dir and indexes every .txt, .doc and .xls file found.
     *
     * @param writer open IndexWriter documents are added to
     * @param dir    directory to walk
     * @throws IOException on file access failure
     */
    private static void indexDirectory(IndexWriter writer, File dir)
            throws IOException {

        File[] files = dir.listFiles();
        if (files == null) {
            // listFiles() returns null on I/O error or non-directories.
            return;
        }
        for (int i = 0; i < files.length; i++) {
            if (files[i].isDirectory()) {
                indexDirectory(writer, files[i]); // recurse into subdirectory
            } else if (files[i].getName().endsWith(".txt")) {
                indexTxtFile(writer, files[i]);
            } else if (files[i].getName().endsWith(".doc")) {
                indexWordFile(writer, files[i]);
            } else if (files[i].getName().endsWith(".xls")) {
                indexExcelFile(writer, files[i]);
            }
        }
    }

    /**
     * Indexes one Excel 2003 (.xls) file: concatenates every cell of the
     * first sheet into a "contents" field, plus a "filename" field.
     * Uses the jxl (JExcelAPI) Workbook/Sheet API.
     */
    private static void indexExcelFile(IndexWriter writer, File f) {
        if (f.isHidden() || !f.exists() || !f.canRead()) {
            return;
        }
        Workbook rwb = null;
        InputStream is = null;
        try {
            is = new FileInputStream(f);
            rwb = Workbook.getWorkbook(is);
            // The first sheet in an Excel document has index 0.
            Sheet st = rwb.getSheet(0);
            // getCell(int column, int row) is the generic cell accessor.
            int rows = st.getRows();
            int cols = st.getColumns();
            System.out.println("当前工作表的名字:" + st.getName());
            System.out.println("总行数:" + rows);
            System.out.println("总列数:" + cols);
            // StringBuilder avoids the O(n^2) copying of repeated String +=.
            StringBuilder content = new StringBuilder();
            for (int i = 0; i < rows; i++) {
                for (int j = 0; j < cols; j++) {
                    content.append(st.getCell(j, i).getContents());
                }
            }

            Document doc = new Document();
            doc.add(new Field("contents", content.toString(), Field.Store.YES, Field.Index.ANALYZED));
            doc.add(new Field("filename", f.getCanonicalPath(), Field.Store.YES, Field.Index.ANALYZED));
            writer.addDocument(doc);
        } catch (Exception e) {
            e.printStackTrace();
            System.out.println("出错了");
        } finally {
            // Original called rwb.close() unconditionally and threw NPE when
            // Workbook.getWorkbook had failed.
            if (rwb != null) {
                rwb.close();
            }
            if (is != null) {
                try {
                    is.close();
                } catch (IOException ignored) {
                    // best-effort cleanup; nothing useful to do here
                }
            }
        }
    }

    /**
     * Indexes one Word 2003 (.doc) file using tm-extractors' WordExtractor
     * (see http://www.ibiblio.org/maven2/org/textmining/tm-extractors/).
     * Only the plain text is extracted; jacob would be needed to modify
     * Word documents.
     */
    private static void indexWordFile(IndexWriter writer, File f) {
        if (f.isHidden() || !f.exists() || !f.canRead()) {
            return;
        }
        FileInputStream in = null;
        try {
            System.out.println("Indexing " + f.getCanonicalPath());
            in = new FileInputStream(f);
            WordExtractor extractor = new WordExtractor();
            String str = extractor.extractText(in);

            Document doc = new Document();
            doc.add(new Field("contents", str, Field.Store.YES, Field.Index.ANALYZED));
            doc.add(new Field("filename", f.getCanonicalPath(), Field.Store.YES, Field.Index.ANALYZED));
            writer.addDocument(doc);
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            if (in != null) {
                try {
                    in.close(); // original leaked the stream
                } catch (IOException ignored) {
                    // best-effort cleanup
                }
            }
        }
    }

    /**
     * Indexes one plain-text (.txt) file: reads up to the first 30 lines
     * into a "contents" field, plus a "filename" field.
     *
     * NOTE(review): FileReader uses the platform default charset — confirm
     * the data files match it, otherwise non-ASCII text will be garbled.
     *
     * @param writer open IndexWriter the document is added to
     * @param f      text file to index
     * @throws IOException on read failure
     */
    private static void indexTxtFile(IndexWriter writer, File f)
            throws IOException {

        if (f.isHidden() || !f.exists() || !f.canRead()) {
            return;
        }
        System.out.println("Indexing " + f.getCanonicalPath());
        BufferedReader br = new BufferedReader(new FileReader(f));
        StringBuilder content = new StringBuilder();
        try {
            /*
             * BUG FIX: the original called br.readLine() twice per iteration —
             * once in the null check and once in the concatenation — so it
             * skipped every other line and appended the literal "null" at EOF.
             */
            String line;
            for (int i = 0; i < 30 && (line = br.readLine()) != null; i++) {
                content.append(line);
            }
        } finally {
            br.close(); // original leaked both readers
        }
        Document doc = new Document();
        doc.add(new Field("contents", content.toString(), Field.Store.YES, Field.Index.ANALYZED));
        doc.add(new Field("filename", f.getCanonicalPath(), Field.Store.YES, Field.Index.ANALYZED));

        /*
         * Field.Index has five options:
         *  ANALYZED: tokenize, then index.
         *  NOT_ANALYZED: index the whole value as one token (author names,
         *    dates, etc. that must not be split).
         *  NO: do not index — for stored-only metadata such as URLs.
         *  NOT_ANALYZED_NO_NORMS: like NOT_ANALYZED, without norms.
         *  ANALYZED_NO_NORMS: like ANALYZED, without norms.
         */
        writer.addDocument(doc);
    }
}


搜索索引 Lucene 3.0(第二步)
package com.gjw.lecence;

import java.io.File;
import java.util.Date;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.queryParser.MultiFieldQueryParser;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.search.highlight.Formatter;
import org.apache.lucene.search.highlight.Fragmenter;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

/**
 * 搜索索引 Lucene 3.0(第二步)
 * 
 * @author RenWeigang
 * 
 * @version 2010.12.13
 * 
 */
public class Searcher {

    // Directory containing the index built by Indexer.
    private static String INDEX_DIR = "E:\\index";

    // Default query string and maximum number of hits collected.
    private static String KEYWORD = "高军威";
    private static int TOP_NUM = 100;

    public static void main(String[] args) throws Exception {

        File indexDir = new File(INDEX_DIR);
        if (!indexDir.exists() || !indexDir.isDirectory()) {
            throw new Exception(indexDir
                    + " does not exist or is not a directory.");
        }

        searchs(indexDir, KEYWORD);
    }

    /**
     * Multi-field query over "contents" and "filename" with red-font
     * highlighting of the matched terms.
     *
     * @param indexDir index directory
     * @param q        query string
     * @throws Exception on parse or I/O failure
     */
    public static void searchs(File indexDir, String q) throws Exception {

        // Unlike IndexWriter, IndexSearcher only needs the index directory.
        IndexSearcher indexSearch = new IndexSearcher(FSDirectory.open(indexDir), true); // read-only
        try {
            // One analyzer instance, reused for parsing and highlighting
            // (original built a new StandardAnalyzer per field per hit).
            StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);
            /*
             * MultiFieldQueryParser arguments:
             *  1) the Lucene version,
             *  2) the fields to search,
             *  3) the analyzer used to tokenize the query.
             */
            String[] fields = {"contents", "filename"};
            MultiFieldQueryParser parser = new MultiFieldQueryParser(Version.LUCENE_30, fields, analyzer);
            Query query = parser.parse(q);

            // --------------------- highlighting ---------------------
            // Formatter sets the markup around matches; QueryScorer tells the
            // highlighter which terms from the query should be highlighted.
            Formatter formatter = new SimpleHTMLFormatter("<font color='red'>", "</font>");
            QueryScorer scorer = new QueryScorer(query);
            Highlighter highlighter = new Highlighter(formatter, scorer);
            // Fragmenter produces the snippet shown with each hit (20 chars).
            Fragmenter fragmenter = new SimpleFragmenter(20);
            highlighter.setTextFragmenter(fragmenter);
            // --------------------------------------------------------
            TopScoreDocCollector collector = TopScoreDocCollector.create(TOP_NUM, false);

            // start time
            long start = new Date().getTime();

            indexSearch.search(query, collector);
            // The collector's TopDocs holds a scoreDocs[] array of doc ids.
            ScoreDoc[] hits = collector.topDocs().scoreDocs;

            System.out.println("找到了" + hits.length + "个");

            // Rehydrate each Document via indexSearch.doc and read its fields.
            for (int i = 0; i < hits.length; i++) {
                ScoreDoc scoreDoc = hits[i];
                float score = scoreDoc.score; // relevance score
                Document doc = indexSearch.doc(hits[i].doc);
                String text = highlighter.getBestFragment(analyzer, "filename", doc.get("filename"));
                String text2 = highlighter.getBestFragment(analyzer, "contents", doc.get("contents"));
                // getBestFragment returns null when nothing matched in the
                // field; fall back to the raw stored value.
                if (text == null) text = doc.get("filename");
                if (text2 == null) text2 = doc.get("contents");
                System.out.println(text + "------相关度得分" + score + "------" + hits[i].toString() + "\n内容:" + text2);
            }

            // end time
            long end = new Date().getTime();

            System.out.println("Found " + collector.getTotalHits()
                    + " document(s) (in " + (end - start)
                    + " milliseconds) that matched query '" + q + "':");
        } finally {
            indexSearch.close(); // original leaked the searcher on exception
        }
    }

    /**
     * Single-field query over "contents" with default &lt;b&gt; highlighting.
     *
     * @param indexDir index directory
     * @param q        query string
     * @throws Exception on parse or I/O failure
     */
    public static void search(File indexDir, String q) throws Exception {

        IndexSearcher indexSearch = new IndexSearcher(FSDirectory.open(indexDir), true); // read-only
        try {
            StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);
            // QueryParser(version, default field, analyzer).
            QueryParser parser = new QueryParser(Version.LUCENE_30, "contents", analyzer);
            TopScoreDocCollector collector = TopScoreDocCollector.create(TOP_NUM, false);
            // start time
            long start = new Date().getTime();
            Query query = parser.parse(q);
            indexSearch.search(query, collector);

            // --------------------- highlighting ---------------------
            // No-arg SimpleHTMLFormatter wraps matches in <b> tags.
            Formatter formatter = new SimpleHTMLFormatter();
            QueryScorer scorer = new QueryScorer(query);
            Highlighter highlighter = new Highlighter(formatter, scorer);
            // 200-char fragments for the per-hit snippet.
            highlighter.setTextFragmenter(new SimpleFragmenter(200));
            // --------------------------------------------------------

            ScoreDoc[] hits = collector.topDocs().scoreDocs;

            System.out.println("找到了" + hits.length + "个");

            for (int i = 0; i < hits.length; i++) {
                ScoreDoc scoreDoc = hits[i];
                float score = scoreDoc.score;
                Document doc = indexSearch.doc(hits[i].doc);
                String text = highlighter.getBestFragment(analyzer, "filename", doc.get("filename"));
                if (text == null) text = doc.get("filename");
                /*
                 * BUG FIX: the indexer stores the body under "contents"; the
                 * original read doc.get("content"), which is always null.
                 */
                System.out.println(text + "------相关度得分" + score + "------" + hits[i].toString() + "\n内容:" + doc.get("contents"));
            }

            // end time
            long end = new Date().getTime();

            System.out.println("Found " + collector.getTotalHits()
                    + " document(s) (in " + (end - start)
                    + " milliseconds) that matched query '" + q + "':");
        } finally {
            indexSearch.close();
        }
    }
}


对数据库创建索引及查询

建立数据库索引 lucene3.6
package com.gjw.DB;

import java.io.File;
import java.io.IOException;
import java.sql.ResultSet;
import java.sql.SQLException;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.SimpleFSDirectory;
import org.apache.lucene.util.Version;



/**********************
*
* 建立数据库索引 lucene3.6+
* 
*/

public class DataBaseIndexer {

    /**
     * Builds a Lucene index over every row of the CEC_VENDOR table, storing
     * the vendor_name, vendor_address and vendor_id columns as analyzed,
     * stored fields.
     *
     * @throws IOException  on index write failure
     * @throws SQLException on database access failure
     */
    public static void main(String[] args) throws IOException, SQLException {
        String indexDir = "e:\\index2";
        DBConn conn = new DBConn();
        conn.OpenConnection();

        ResultSet rs = conn.ExecuteQuery("select * from CEC_VENDOR");
        try {
            // Index directory on the local file system.
            Directory dir = new SimpleFSDirectory(new File(indexDir));

            // create=true recreates the index; UNLIMITED removes the
            // per-field token cap.
            IndexWriter writer = new IndexWriter(dir,
                    new StandardAnalyzer(Version.LUCENE_30), true,
                    IndexWriter.MaxFieldLength.UNLIMITED);
            try {
                // One Lucene Document per table row.
                while (rs.next()) {
                    Document doc = new Document();
                    doc.add(new Field("vendor_name", rs.getString("vendor_name"), Field.Store.YES, Field.Index.ANALYZED));
                    doc.add(new Field("vendor_address", rs.getString("vendor_address"), Field.Store.YES, Field.Index.ANALYZED));
                    doc.add(new Field("vendor_id", rs.getString("vendor_id"), Field.Store.YES, Field.Index.ANALYZED));
                    writer.addDocument(doc);
                }
                System.out.println("numDocs" + writer.numDocs());
                writer.optimize();
                writer.commit();
            } finally {
                // Original leaked the writer (and its lock) on any error.
                writer.close();
            }
        } finally {
            rs.close();
            // NOTE(review): DBConn's API is not visible here; presumably it
            // wraps a java.sql.Connection that should also be closed — confirm
            // and add a conn close call if one exists.
        }
    }
}


搜索索引 Lucene 3.0(第二步)
package com.gjw.DB;

import java.io.File;
import java.util.Date;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.queryParser.MultiFieldQueryParser;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.search.highlight.Formatter;
import org.apache.lucene.search.highlight.Fragmenter;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

/**
 * 搜索索引 Lucene 3.0(第二步)
 * 
 * @author RenWeigang
 * 
 * @version 2010.12.13
 * 
 */
public class Searcher {

    // Directory containing the index built by DataBaseIndexer.
    private static String INDEX_DIR = "E:\\index2";

    private static String KEYWORD = "搜索关键字";
    private static int TOP_NUM = 100;

    public static void main(String[] args) throws Exception {

        File indexDir = new File(INDEX_DIR);
        if (!indexDir.exists() || !indexDir.isDirectory()) {
            throw new Exception(indexDir
                    + " does not exist or is not a directory.");
        }

        searchs(indexDir, KEYWORD);
    }

    /**
     * Multi-field query over the vendor fields with red-font highlighting.
     *
     * @param indexDir index directory
     * @param q        query string
     * @throws Exception on parse or I/O failure
     */
    public static void searchs(File indexDir, String q) throws Exception {

        // Unlike IndexWriter, IndexSearcher only needs the index directory.
        IndexSearcher indexSearch = new IndexSearcher(FSDirectory.open(indexDir), true); // read-only
        try {
            // One analyzer instance, reused for parsing and highlighting
            // (original built a new StandardAnalyzer per field per hit).
            StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);
            /*
             * MultiFieldQueryParser arguments:
             *  1) the Lucene version,
             *  2) the fields to search,
             *  3) the analyzer used to tokenize the query.
             */
            String[] fields = {"vendor_name", "vendor_id", "vendor_address"};
            MultiFieldQueryParser parser = new MultiFieldQueryParser(Version.LUCENE_30, fields, analyzer);
            Query query = parser.parse(q);

            // --------------------- highlighting ---------------------
            // Formatter sets the markup around matches; QueryScorer tells the
            // highlighter which terms from the query should be highlighted.
            Formatter formatter = new SimpleHTMLFormatter("<font color='red'>", "</font>");
            QueryScorer scorer = new QueryScorer(query);
            Highlighter highlighter = new Highlighter(formatter, scorer);
            // Fragmenter produces the snippet shown with each hit (20 chars).
            Fragmenter fragmenter = new SimpleFragmenter(20);
            highlighter.setTextFragmenter(fragmenter);
            // --------------------------------------------------------
            TopScoreDocCollector collector = TopScoreDocCollector.create(TOP_NUM, false);

            // start time
            long start = new Date().getTime();

            indexSearch.search(query, collector);
            // The collector's TopDocs holds a scoreDocs[] array of doc ids.
            ScoreDoc[] hits = collector.topDocs().scoreDocs;

            System.out.println("找到了" + hits.length + "个");

            // Rehydrate each Document via indexSearch.doc and read its fields.
            for (int i = 0; i < hits.length; i++) {
                ScoreDoc scoreDoc = hits[i];
                float score = scoreDoc.score; // relevance score
                Document doc = indexSearch.doc(hits[i].doc);
                String text = highlighter.getBestFragment(analyzer, "vendor_name", doc.get("vendor_name"));
                String text2 = highlighter.getBestFragment(analyzer, "vendor_id", doc.get("vendor_id"));
                // getBestFragment returns null when nothing matched in the
                // field; fall back to the raw stored value.
                if (text == null) text = doc.get("vendor_name");
                if (text2 == null) text2 = doc.get("vendor_id");
                System.out.println(text + "------相关度得分" + score + "------" + hits[i].toString() + "\n内容:" + text2);
            }

            // end time
            long end = new Date().getTime();

            System.out.println("Found " + collector.getTotalHits()
                    + " document(s) (in " + (end - start)
                    + " milliseconds) that matched query '" + q + "':");
        } finally {
            indexSearch.close(); // original leaked the searcher on exception
        }
    }

    /**
     * Single-field query over "vendor_name" with default &lt;b&gt;
     * highlighting.
     *
     * @param indexDir index directory
     * @param q        query string
     * @throws Exception on parse or I/O failure
     */
    public static void search(File indexDir, String q) throws Exception {

        IndexSearcher indexSearch = new IndexSearcher(FSDirectory.open(indexDir), true); // read-only
        try {
            StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);
            /*
             * BUG FIX: this method was copy-pasted from the file-index
             * Searcher and queried/displayed the fields "contents",
             * "filename" and "content", none of which exist in this index.
             * DataBaseIndexer stores vendor_name / vendor_address /
             * vendor_id, so those are the fields used here.
             */
            QueryParser parser = new QueryParser(Version.LUCENE_30, "vendor_name", analyzer);
            TopScoreDocCollector collector = TopScoreDocCollector.create(TOP_NUM, false);
            // start time
            long start = new Date().getTime();
            Query query = parser.parse(q);
            indexSearch.search(query, collector);

            // --------------------- highlighting ---------------------
            // No-arg SimpleHTMLFormatter wraps matches in <b> tags.
            Formatter formatter = new SimpleHTMLFormatter();
            QueryScorer scorer = new QueryScorer(query);
            Highlighter highlighter = new Highlighter(formatter, scorer);
            // 200-char fragments for the per-hit snippet.
            highlighter.setTextFragmenter(new SimpleFragmenter(200));
            // --------------------------------------------------------

            ScoreDoc[] hits = collector.topDocs().scoreDocs;

            System.out.println("找到了" + hits.length + "个");

            for (int i = 0; i < hits.length; i++) {
                ScoreDoc scoreDoc = hits[i];
                float score = scoreDoc.score;
                Document doc = indexSearch.doc(hits[i].doc);
                String text = highlighter.getBestFragment(analyzer, "vendor_name", doc.get("vendor_name"));
                if (text == null) text = doc.get("vendor_name");
                System.out.println(text + "------相关度得分" + score + "------" + hits[i].toString() + "\n内容:" + doc.get("vendor_id"));
            }

            // end time
            long end = new Date().getTime();

            System.out.println("Found " + collector.getTotalHits()
                    + " document(s) (in " + (end - start)
                    + " milliseconds) that matched query '" + q + "':");
        } finally {
            indexSearch.close();
        }
    }
}

对文件索引的查询,和对数据库索引的查询是一样的
下面看看怎么删除指定索引

Lucene 3.0+ 删除索引
package com.gjw.lecence;

import java.io.File;
import java.io.IOException;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.SimpleFSDirectory;
import org.apache.lucene.util.Version;
/**
 * Lucene 3.0+ 删除索引
 * @author Administrator
 *
 */

public class DeleteIndex {

    /**
     * Deletes from the index every document whose "filename" field contains
     * the term "6667", then prints the before/after document counts.
     *
     * @throws CorruptIndexException if the index is corrupt
     * @throws IOException           on index access failure
     */
    public static void main(String[] args) throws CorruptIndexException, IOException {
        // Directory holding the existing index.
        String indexDir = "E:\\index";
        // create=false opens the existing index for modification instead of
        // recreating it (true would wipe it).
        Directory dir = new SimpleFSDirectory(new File(indexDir));
        IndexWriter indexWriter = new IndexWriter(dir, new StandardAnalyzer(Version.LUCENE_30), false, IndexWriter.MaxFieldLength.UNLIMITED);
        try {
            /*
             * Term(field, text) identifies what to delete: deleteDocuments
             * removes every document containing that term in that field.
             * NOTE(review): "filename" was indexed ANALYZED, so this only
             * matches if the analyzer emitted "6667" as a token of the stored
             * path — confirm against the indexing code.
             */
            Term term = new Term("filename", "6667");
            indexWriter.deleteDocuments(term);
            // optimize() would purge the deleted docs from the segments:
            //indexWriter.optimize();
            indexWriter.commit();
            System.out.println("是否有删除=" + indexWriter.hasDeletions());
            // Without optimize(), maxDoc() still counts deleted documents
            // while numDocs() does not — hence the two lines may differ.
            System.out.println("一共有" + indexWriter.maxDoc() + "索引");
            System.out.println("还剩" + indexWriter.numDocs() + "索引");
        } finally {
            // Original leaked the writer (and the index lock) if the delete
            // or the commit threw.
            indexWriter.close();
        }
    }

}

先撒泡尿,做个记号