使用Lucene实现全文检索,主要有下面三个步骤:
1、建立索引库:根据网站新闻信息库中的已有的数据资料建立Lucene索引文件。
2、通过索引库搜索:有了索引后,即可使用标准的词法分析器或直接的词法分析器对索引库进行全文检索。
3、维护索引库:网站新闻信息库中的信息会不断地变动,包括新增、修改及删除等,这些信息的变动都需要进一步反映到Lucene索引文件中。
下面是myrss.easyjf.com相关代码!
一、索引管理(建立及维护)
索引管理类MyRssIndexManage主要实现根据网站信息库中的数据建立索引,维护索引等。由于索引的过程需要消耗一定的时间,因此,索引管理类实现Runnable接口,使得我们可以在程序中开新线程来运行。
package com.easyjf.lucene;
import java.util.Date;
import java.util.List;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.queryParser.MultiFieldQueryParser;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Searcher;
import com.easyjf.dbo.EasyJDB;
import com.easyjf.news.business.NewsDir;
import com.easyjf.news.business.NewsDoc;
import com.easyjf.news.business.NewsUtil;
import com.easyjf.web.tools.IPageList;
20public class MyRssIndexManage implements Runnable {
21 private String indexDir;
22 private String indexType="add";
23 public void run() {
24 // TODO Auto-generated method stub
25 if("add".equals(indexType))
26 normalIndex();
27 else if ("init".equals(indexType)) reIndexAll();
28 }
29 public void normalIndex()
30 {
31 try{
32 Date start = new Date();
33 int num=0;
34 IndexWriter writer=new IndexWriter(indexDir,new StandardAnalyzer(),false);
35 //NewsDir dir=NewsDir.readBySn();
36 String scope="(needIndex<2) or(needIndex is null)";
37 IPageList pList=NewsUtil.pageList(scope,1,50);
38 for(int p=0;p {
39 pList=NewsUtil.pageList(scope,p,100);
40 List list=pList.getResult();
41 for(int i=0;i {
42 NewsDoc doc=(NewsDoc)list.get(i);
43 writer.addDocument(newsdoc2lucenedoc(doc));
44 num++;
45 }
46 }
47 writer.optimize();
48 writer.close();
49 EasyJDB.getInstance().execute("update NewsDoc set needIndex=2 where "+scope);
50 Date end = new Date();
51 System.out.print("新增索引"+num+"条信息,一共花:"+(end.getTime() - start.getTime())/60000+"分钟!");
52 }
53 catch(Exception e)
54 {
55 e.printStackTrace();
56 }
57 }
58 public void reIndexAll()
59 {
60 try{
61 Date start = new Date();
62 int num=0;
63 IndexWriter writer=new IndexWriter(indexDir,new StandardAnalyzer(),true);
64 NewsDir dir=NewsDir.readBySn("easyjf");
65 IPageList pList=NewsUtil.pageList(dir,1,50);
66 for(int p=0;p {
67 pList=NewsUtil.pageList(dir,p,100);
68 List list=pList.getResult();
69 for(int i=0;i {
70 NewsDoc doc=(NewsDoc)list.get(i);
71 writer.addDocument(newsdoc2lucenedoc(doc));
72 num++;
73 }
74 }
75 writer.optimize();
76 writer.close();
77 EasyJDB.getInstance().execute("update NewsDoc set needIndex=2 where dirPath like 'easyjf%'");
78 Date end = new Date();
79 System.out.print("全部重新做了一次索引,一共处理了"+num+"条信息,花:"+(end.getTime() - start.getTime())/60000+"分钟!");
80 }
81 catch(Exception e)
82 {
83 e.printStackTrace();
84 }
85 }
86 private Document newsdoc2lucenedoc(NewsDoc doc)
87 {
88 Document lDoc=new Document();
89 lDoc.add(new Field("title",doc.getTitle(),Field.Store.YES,Field.Index.TOKENIZED));
90 lDoc.add(new Field("content",doc.getContent(),Field.Store.YES,Field.Index.TOKENIZED));
91 lDoc.add(new Field("url",doc.getRemark(),Field.Store.YES,Field.Index.NO));
92 lDoc.add(new Field("cid",doc.getCid(),Field.Store.YES,Field.Index.NO));
93 lDoc.add(new Field("source",doc.getSource(),Field.Store.YES,Field.Index.NO));
94 lDoc.add(new Field("inputTime",doc.getInputTime().toString(),Field.Store.YES,Field.Index.NO));
95 return lDoc;
96 }
97 public String getIndexDir() {
98 return indexDir;
99 }
100 public void setIndexDir(String indexDir) {
101 this.indexDir = indexDir;
102 }
103
104 public String getIndexType() {
105 return indexType;
106 }
107 public void setIndexType(String indexType) {
108 this.indexType = indexType;
109 }
110}
111
二、使用Lucene实现全文搜索
下面是MyRssSearch类的源码,该类主要实现使用Lucene中Searcher及QueryParser实现从索引库中搜索关键词。
package com.easyjf.lucene;

import java.util.List;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryParser.MultiFieldQueryParser;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Searcher;

import com.easyjf.search.MyRssUtil;
import com.easyjf.search.SearchContent;
import com.easyjf.web.tools.IPageList;
import com.easyjf.web.tools.PageList;

19public class MyRssSearch {
20 private String indexDir;
21 IndexReader ir;
22 Searcher search;
23 public IPageList search(String key,int pageSize,int currentPage)
24 {
25 IPageList pList=new PageList(new HitsQuery(doSearch(key)));
26 pList.doList(pageSize,currentPage,"","",null);
27 if(pList!=null)
28 {
29 List list=pList.getResult();
30 if(list!=null){
31 for(int i=0;i {
32 list.set(i,lucene2searchObj((Document)list.get(i),key));
33 }
34 }
35 }
36 try{
37 if(search!=null)search.close();
38 if(ir!=null)ir.close();
39 }
40 catch(Exception e)
41 {
42 e.printStackTrace();
43 }
44 return pList;
45 }
46 private SearchContent lucene2searchObj(Document doc,String key)
47 {
48 SearchContent searchObj=new SearchContent();
49 String title=doc.getField("title").stringValue();
50 searchObj.setTitle(title.replaceAll(key,""+key+""));
51 searchObj.setTvalue(doc.getField("cid").stringValue());
52 searchObj.setUrl(doc.getField("url").stringValue());
53 searchObj.setSource(doc.getField("source").stringValue());
54 searchObj.setLastUpdated(doc.getField("inputTime").stringValue());
55 searchObj.setIntro(MyRssUtil.content2intro(doc.getField("content").stringValue(),key));
56 return searchObj;
57 }
58 public Hits doSearch(String key)
59 {
60 Hits hits=null;
61 try{
62 ir=IndexReader.open(indexDir);
63 search=new IndexSearcher(ir);
64 String fields[]={"title","content"};
65 QueryParser parser=new MultiFieldQueryParser(fields,new StandardAnalyzer());
66 Query query=parser.parse(key);
67 hits=search.search(query);
68 }
69 catch(Exception e)
70 {
71 e.printStackTrace();
72 }
73 //System.out.println("搜索结果:"+hits.length());
74 return hits;
75 }
76
77 public String getIndexDir() {
78 return indexDir;
79 }
80 public void setIndexDir(String indexDir) {
81 this.indexDir = indexDir;
82 }
83}
84
在上面的代码中,search方法返回一个封装了分页查询结果的IPageList,IPageList是EasyJWeb Tools业务引擎中的分页引擎,对于IPageList的使用,请看本人写的这篇文章
《EasyJWeb Tools中业务引擎分页的设计实现》:
我们针对Lucene的查询结果Hits结构,写了一个查询器HitsQuery。代码如下所示:
package com.easyjf.lucene;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import org.apache.lucene.search.Hits;
import com.easyjf.web.tools.IQuery;
7public class HitsQuery implements IQuery {
8 private int begin=0;
9 private int max=0;
10 private Hits hits;
11 public HitsQuery()
12 {
13
14 }
15 public HitsQuery(Hits hits)
16 {
17 if(hits!=null)
18 {
19 this.hits=hits;
20 this.max=hits.length();
21 }
22 }
23 public int getRows(String arg0) {
24 // TODO Auto-generated method stub
25 return (hits==null?0:hits.length());
26 }
27 public List getResult(String arg0) {
28 // TODO Auto-generated method stub
29 List list=new ArrayList();
30 for(int i=begin;i<(begin+max)&&(i {
31 try{
32 list.add(hits.doc(i));
33 }
34 catch(Exception e)
35 {
36 e.printStackTrace();
37 }
38 }
39 return list;
40 }
41 public void setFirstResult(int begin) {
42 // TODO Auto-generated method stub
43 this.begin=begin;
44 }
45 public void setMaxResults(int max) {
46 // TODO Auto-generated method stub
47 this.max=max;
48 }
49 public void setParaValues(Collection arg0) {
50 // TODO Auto-generated method stub
51
52 }
53 public List getResult(String condition, int begin, int max) {
54 // TODO Auto-generated method stub
55 if((begin>=0)&&(begin if(!(max>hits.length()))this.max=max;
56 return getResult(condition);
57 }
58}
59
三、Web调用
下面我们来看看在Web中如何调用商业逻辑层的全文检索功能。下面是处理用户请求的Action中关于搜索部分的源码:
package com.easyjf.news.action;
2public class SearchAction implements IWebAction {
3public Page doSearch(WebForm form,Module module)throws Exception
4{
5 String key=CommUtil.null2String(form.get("v"));
6 key=URLDecoder.decode(URLEncoder.encode(key,"ISO8859_1"),"utf-8");
7 form.set("v",key);
8 form.addResult("v2",URLEncoder.encode(key,"utf-8"));
9 if(key.getBytes().length>2){
10 String orderBy=CommUtil.null2String(form.get("order"));
11 int currentPage=CommUtil.null2Int(form.get("page"));
12 int pageSize=CommUtil.null2Int(form.get("pageSize"));
13 if(currentPage<1)currentPage=1;
14 if(pageSize<1)pageSize=15;
15 SearchEngine search=new SearchEngine(key,orderBy,pageSize,currentPage);
16 search.getLuceneSearch().setIndexDir(Globals.APP_BASE_DIR+"/WEB-INF/index");
17 search.doSearchByLucene();
18 IPageList pList=search.getResult();
19 if(pList!=null && pList.getRowCount()>0){
20 form.addResult("list",pList.getResult());
21 form.addResult("pages",new Integer(pList.getPages()));
22 form.addResult("rows",new Integer(pList.getRowCount()));
23 form.addResult("page",new Integer(pList.getCurrentPage()));
24 form.addResult("gotoPageHTML",CommUtil.showPageHtml(pList.getCurrentPage(),pList.getPages()));
25 }
26 else
27 {
28 form.addResult("notFound","true");//找不到数据
29 }
30 }
31 else
32 form.addResult("errMsg","您输入的关键字太短!");
33 form.addResult("hotSearch",SearchEngine.getHotSearch(20));
34 return null;
35}
36}
其中调用的SearchEngine类中有关Lucene部分的源码:
38public class SearchEngine {
39private MyRssSearch luceneSearch=new MyRssSearch();
40public void doSearchByLucene()
41{
42 SearchKey keyObj=readCache();
43 if(keyObj!=null){
44 result=luceneSearch.search(key,pageSize,currentPage);
45 if(updateStatus){
46 keyObj.setReadTimes(new Integer(keyObj.getReadTimes().intValue()+1));
47 keyObj.update();
48 }
49 }
50 else//缓存中没有该关键字信息,生成关键字搜索结果
51 {
52 keyObj=new SearchKey();
53 keyObj.setTitle(key);
54 keyObj.setLastUpdated(new Date());
55 keyObj.setReadTimes(new Integer(1));
56 keyObj.setStatus(new Integer(0));
57 keyObj.setSequence(new Integer(1));
58 keyObj.setVdate(new Date());
59 keyObj.save();
60 result=luceneSearch.search(key,pageSize,currentPage);;
61
62 }
63}
64}
65
本文转自:http://java.ccidnet.com/art/3749/20060704/595099_1.html
---------------------------------------------------------------------------------------------------------------------------------
说人之短,乃护己之短。夸己之长,乃忌人之长。皆由存心不厚,识量太狭耳。能去此弊,可以进德,可以远怨。
http://www.blogjava.net/szhswl
------------------------------------------------------------------------------------------------------ ----------------- ---------
posted on 2007-12-05 17:08
宋针还 阅读(391)
评论(0) 编辑 收藏 所属分类:
搜索引擎