使用Lucene实现全文检索,主要有下面三个步骤:
1、建立索引库:根据网站新闻信息库中的已有的数据资料建立Lucene索引文件。
2、通过索引库搜索:有了索引后,即可使用标准的词法分析器或直接的词法分析器进行全文检索。
3、维护索引库:网站新闻信息库中的信息会不断的变动,包括新增、修改及删除等,这些信息的变动都需要进一步反映到Lucene索引文件中。
下面是myrss.easyjf.com相关代码!
一、索引管理(建立及维护)
索引管理类MyRssIndexManage主要实现根据网站信息库中的数据建立索引,维护索引等。由于索引的过程需要消耗一定的时间,因此,索引管理类实现Runnable接口,使得我们可以在程序中开新线程来运行。
1
package com.easyjf.lucene;
2
import java.util.Date;
3
import java.util.List;
4
import org.apache.lucene.analysis.standard.StandardAnalyzer;
5
import org.apache.lucene.document.Document;
6
import org.apache.lucene.document.Field;
7
import org.apache.lucene.index.IndexReader;
8
import org.apache.lucene.index.IndexWriter;
9
import org.apache.lucene.queryParser.MultiFieldQueryParser;
10
import org.apache.lucene.queryParser.QueryParser;
11
import org.apache.lucene.search.Hits;
12
import org.apache.lucene.search.IndexSearcher;
13
import org.apache.lucene.search.Query;
14
import org.apache.lucene.search.Searcher;
15
import com.easyjf.dbo.EasyJDB;
16
import com.easyjf.news.business.NewsDir;
17
import com.easyjf.news.business.NewsDoc;
18
import com.easyjf.news.business.NewsUtil;
19
import com.easyjf.web.tools.IPageList;
20
public class MyRssIndexManage implements Runnable
{
21
private String indexDir;
22
private String indexType="add";
23
public void run()
{
24
// TODO Auto-generated method stub
25
if("add".equals(indexType))
26
normalIndex();
27
else if ("init".equals(indexType)) reIndexAll();
28
}
29
public void normalIndex()
30
{
31
try
{
32
Date start = new Date();
33
int num=0;
34
IndexWriter writer=new IndexWriter(indexDir,new StandardAnalyzer(),false);
35
//NewsDir dir=NewsDir.readBySn();
36
String scope="(needIndex<2) or(needIndex is null)";
37
IPageList pList=NewsUtil.pageList(scope,1,50);
38
for(int p=0;p
{
39
pList=NewsUtil.pageList(scope,p,100);
40
List list=pList.getResult();
41
for(int i=0;i
{
42
NewsDoc doc=(NewsDoc)list.get(i);
43
writer.addDocument(newsdoc2lucenedoc(doc));
44
num++;
45
}
46
}
47
writer.optimize();
48
writer.close();
49
EasyJDB.getInstance().execute("update NewsDoc set needIndex=2 where "+scope);
50
Date end = new Date();
51
System.out.print("新增索引"+num+"条信息,一共花:"+(end.getTime() - start.getTime())/60000+"分钟!");
52
}
53
catch(Exception e)
54
{
55
e.printStackTrace();
56
}
57
}
58
public void reIndexAll()
59
{
60
try
{
61
Date start = new Date();
62
int num=0;
63
IndexWriter writer=new IndexWriter(indexDir,new StandardAnalyzer(),true);
64
NewsDir dir=NewsDir.readBySn("easyjf");
65
IPageList pList=NewsUtil.pageList(dir,1,50);
66
for(int p=0;p
{
67
pList=NewsUtil.pageList(dir,p,100);
68
List list=pList.getResult();
69
for(int i=0;i
{
70
NewsDoc doc=(NewsDoc)list.get(i);
71
writer.addDocument(newsdoc2lucenedoc(doc));
72
num++;
73
}
74
}
75
writer.optimize();
76
writer.close();
77
EasyJDB.getInstance().execute("update NewsDoc set needIndex=2 where dirPath like 'easyjf%'");
78
Date end = new Date();
79
System.out.print("全部重新做了一次索引,一共处理了"+num+"条信息,花:"+(end.getTime() - start.getTime())/60000+"分钟!");
80
}
81
catch(Exception e)
82
{
83
e.printStackTrace();
84
}
85
}
86
private Document newsdoc2lucenedoc(NewsDoc doc)
87
{
88
Document lDoc=new Document();
89
lDoc.add(new Field("title",doc.getTitle(),Field.Store.YES,Field.Index.TOKENIZED));
90
lDoc.add(new Field("content",doc.getContent(),Field.Store.YES,Field.Index.TOKENIZED));
91
lDoc.add(new Field("url",doc.getRemark(),Field.Store.YES,Field.Index.NO));
92
lDoc.add(new Field("cid",doc.getCid(),Field.Store.YES,Field.Index.NO));
93
lDoc.add(new Field("source",doc.getSource(),Field.Store.YES,Field.Index.NO));
94
lDoc.add(new Field("inputTime",doc.getInputTime().toString(),Field.Store.YES,Field.Index.NO));
95
return lDoc;
96
}
97
public String getIndexDir()
{
98
return indexDir;
99
}
100
public void setIndexDir(String indexDir)
{
101
this.indexDir = indexDir;
102
}
103
104
public String getIndexType()
{
105
return indexType;
106
}
107
public void setIndexType(String indexType)
{
108
this.indexType = indexType;
109
}
110
}
111
二、使用Lucene实现全文搜索
下面是MyRssSearch类的源码,该类主要使用Lucene中的Searcher及QueryParser从索引库中搜索关键词。
1
package com.easyjf.lucene;
2
3
import java.util.List;
4
import org.apache.lucene.analysis.standard.StandardAnalyzer;
5
import org.apache.lucene.document.Document;
6
import org.apache.lucene.index.IndexReader;
7
import org.apache.lucene.queryParser.MultiFieldQueryParser;
8
import org.apache.lucene.queryParser.QueryParser;
9
import org.apache.lucene.search.Hits;
10
import org.apache.lucene.search.IndexSearcher;
11
import org.apache.lucene.search.Query;
12
import org.apache.lucene.search.Searcher;
13
14
import com.easyjf.search.MyRssUtil;
15
import com.easyjf.search.SearchContent;
16
import com.easyjf.web.tools.IPageList;
17
import com.easyjf.web.tools.PageList;
18
19
public class MyRssSearch
{
20
private String indexDir;
21
IndexReader ir;
22
Searcher search;
23
public IPageList search(String key,int pageSize,int currentPage)
24
{
25
IPageList pList=new PageList(new HitsQuery(doSearch(key)));
26
pList.doList(pageSize,currentPage,"","",null);
27
if(pList!=null)
28
{
29
List list=pList.getResult();
30
if(list!=null)
{
31
for(int i=0;i
{
32
list.set(i,lucene2searchObj((Document)list.get(i),key));
33
}
34
}
35
}
36
try
{
37
if(search!=null)search.close();
38
if(ir!=null)ir.close();
39
}
40
catch(Exception e)
41
{
42
e.printStackTrace();
43
}
44
return pList;
45
}
46
private SearchContent lucene2searchObj(Document doc,String key)
47
{
48
SearchContent searchObj=new SearchContent();
49
String title=doc.getField("title").stringValue();
50
searchObj.setTitle(title.replaceAll(key,""+key+""));
51
searchObj.setTvalue(doc.getField("cid").stringValue());
52
searchObj.setUrl(doc.getField("url").stringValue());
53
searchObj.setSource(doc.getField("source").stringValue());
54
searchObj.setLastUpdated(doc.getField("inputTime").stringValue());
55
searchObj.setIntro(MyRssUtil.content2intro(doc.getField("content").stringValue(),key));
56
return searchObj;
57
}
58
public Hits doSearch(String key)
59
{
60
Hits hits=null;
61
try
{
62
ir=IndexReader.open(indexDir);
63
search=new IndexSearcher(ir);
64
String fields[]=
{"title","content"};
65
QueryParser parser=new MultiFieldQueryParser(fields,new StandardAnalyzer());
66
Query query=parser.parse(key);
67
hits=search.search(query);
68
}
69
catch(Exception e)
70
{
71
e.printStackTrace();
72
}
73
//System.out.println("搜索结果:"+hits.length());
74
return hits;
75
}
76
77
public String getIndexDir()
{
78
return indexDir;
79
}
80
public void setIndexDir(String indexDir)
{
81
this.indexDir = indexDir;
82
}
83
}
84
在上面的代码中,search方法返回一个封装了分页查询结果的IPageList,IPageList是EasyJWeb Tools业务引擎中的分页引擎,对于IPageList的使用,请看本人写的这篇文章
《EasyJWeb Tools中业务引擎分页的设计实现》:
我们针对Lucene的查询结果Hits结构,写了一个查询器HitsQuery。代码如下所示:
1
package com.easyjf.lucene;
2
import java.util.ArrayList;
3
import java.util.Collection;
4
import java.util.List;
5
import org.apache.lucene.search.Hits;
6
import com.easyjf.web.tools.IQuery;
7
public class HitsQuery implements IQuery
{
8
private int begin=0;
9
private int max=0;
10
private Hits hits;
11
public HitsQuery()
12
{
13
14
}
15
public HitsQuery(Hits hits)
16
{
17
if(hits!=null)
18
{
19
this.hits=hits;
20
this.max=hits.length();
21
}
22
}
23
public int getRows(String arg0)
{
24
// TODO Auto-generated method stub
25
return (hits==null?0:hits.length());
26
}
27
public List getResult(String arg0)
{
28
// TODO Auto-generated method stub
29
List list=new ArrayList();
30
for(int i=begin;i<(begin+max)&&(i
{
31
try
{
32
list.add(hits.doc(i));
33
}
34
catch(Exception e)
35
{
36
e.printStackTrace();
37
}
38
}
39
return list;
40
}
41
public void setFirstResult(int begin)
{
42
// TODO Auto-generated method stub
43
this.begin=begin;
44
}
45
public void setMaxResults(int max)
{
46
// TODO Auto-generated method stub
47
this.max=max;
48
}
49
public void setParaValues(Collection arg0)
{
50
// TODO Auto-generated method stub
51
52
}
53
public List getResult(String condition, int begin, int max)
{
54
// TODO Auto-generated method stub
55
if((begin>=0)&&(begin if(!(max>hits.length()))this.max=max;
56
return getResult(condition);
57
}
58
}
59
三、Web调用
下面我们来看看在Web中如何调用商业逻辑层的全文检索功能。下面是处理用户请求的Action中关于搜索部分的源码:
1
package com.easyjf.news.action;
2
public class SearchAction implements IWebAction
{
3
public Page doSearch(WebForm form,Module module)throws Exception
4

{
5
String key=CommUtil.null2String(form.get("v"));
6
key=URLDecoder.decode(URLEncoder.encode(key,"ISO8859_1"),"utf-8");
7
form.set("v",key);
8
form.addResult("v2",URLEncoder.encode(key,"utf-8"));
9
if(key.getBytes().length>2)
{
10
String orderBy=CommUtil.null2String(form.get("order"));
11
int currentPage=CommUtil.null2Int(form.get("page"));
12
int pageSize=CommUtil.null2Int(form.get("pageSize"));
13
if(currentPage<1)currentPage=1;
14
if(pageSize<1)pageSize=15;
15
SearchEngine search=new SearchEngine(key,orderBy,pageSize,currentPage);
16
search.getLuceneSearch().setIndexDir(Globals.APP_BASE_DIR+"/WEB-INF/index");
17
search.doSearchByLucene();
18
IPageList pList=search.getResult();
19
if(pList!=null && pList.getRowCount()>0)
{
20
form.addResult("list",pList.getResult());
21
form.addResult("pages",new Integer(pList.getPages()));
22
form.addResult("rows",new Integer(pList.getRowCount()));
23
form.addResult("page",new Integer(pList.getCurrentPage()));
24
form.addResult("gotoPageHTML",CommUtil.showPageHtml(pList.getCurrentPage(),pList.getPages()));
25
}
26
else
27
{
28
form.addResult("notFound","true");//找不到数据
29
}
30
}
31
else
32
form.addResult("errMsg","您输入的关键字太短!");
33
form.addResult("hotSearch",SearchEngine.getHotSearch(20));
34
return null;
35
}
36
}
37
其中调用的SearchEngine类中有关Lucene部分的源码:
38
public class SearchEngine
{
39
private MyRssSearch luceneSearch=new MyRssSearch();
40
public void doSearchByLucene()
41

{
42
SearchKey keyObj=readCache();
43
if(keyObj!=null)
{
44
result=luceneSearch.search(key,pageSize,currentPage);
45
if(updateStatus)
{
46
keyObj.setReadTimes(new Integer(keyObj.getReadTimes().intValue()+1));
47
keyObj.update();
48
}
49
}
50
else//缓存中没有该关键字信息,生成关键字搜索结果
51
{
52
keyObj=new SearchKey();
53
keyObj.setTitle(key);
54
keyObj.setLastUpdated(new Date());
55
keyObj.setReadTimes(new Integer(1));
56
keyObj.setStatus(new Integer(0));
57
keyObj.setSequence(new Integer(1));
58
keyObj.setVdate(new Date());
59
keyObj.save();
60
result=luceneSearch.search(key,pageSize,currentPage);;
61
62
}
63
}
64
}
65
本文转自:http://java.ccidnet.com/art/3749/20060704/595099_1.html
---------------------------------------------------------------------------------------------------------------------------------
说人之短,乃护己之短。夸己之长,乃忌人之长。皆由存心不厚,识量太狭耳。能去此弊,可以进德,可以远怨。
http://www.blogjava.net/szhswl
------------------------------------------------------------------------------------------------------ ----------------- ---------
posted on 2007-12-05 17:08
宋针还 阅读(395)
评论(0) 编辑 收藏 所属分类:
搜索引擎