posts - 431,  comments - 344,  trackbacks - 0

HTML 解析器
package com.rain.util;

import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.UnsupportedEncodingException;

import org.apache.lucene.demo.html.HTMLParser;

public class HTMLDocParser {

 private String htmlPath;
 private HTMLParser htmlParser;
 
 public HTMLDocParser(String htmlPath){
  this.htmlPath=htmlPath;
  initHtmlParser();
 }
 public void initHtmlParser(){
  InputStream inputStream=null;
  try{
   inputStream=new FileInputStream(htmlPath);
  }catch(FileNotFoundException e){
   e.printStackTrace();
  }
  if(null!=inputStream){
   try{
    htmlParser=new HTMLParser(new InputStreamReader(inputStream,"utf-8"));
   }catch(UnsupportedEncodingException e){
    e.printStackTrace();
   }
  }
 }
 public String getTitle(){
  if(null!=htmlParser){
   try{
    return htmlParser.getTitle();
   }catch(IOException e){
    e.printStackTrace();
   }catch(InterruptedException e){
    e.printStackTrace();
   }
  }
  return "";
 }
 public Reader getContent(){
  if(null!=htmlParser){
   try{
    return htmlParser.getReader();
   }catch(IOException e){
    e.printStackTrace();
   }
  }
  return null;
 }
 public String getPath(){
  return this.htmlPath;
 }
}


描述搜索结果的结构实体Bean
package com.rain.search;

/**
 * Value holder for a single search hit: the path of the matching HTML file
 * and the title of that document. Both properties start out as {@code null}.
 */
public class SearchResultBean {

    /** Title of the matching document. */
    private String htmlTitle;

    /** Path of the matching HTML file. */
    private String htmlPath;

    /**
     * @return the stored document title, or {@code null} if none was set
     */
    public String getHtmlTitle() {
        return this.htmlTitle;
    }

    /**
     * @param htmlTitle the document title to store
     */
    public void setHtmlTitle(String htmlTitle) {
        this.htmlTitle = htmlTitle;
    }

    /**
     * @return the stored file path, or {@code null} if none was set
     */
    public String getHtmlPath() {
        return this.htmlPath;
    }

    /**
     * @param htmlPath the file path to store
     */
    public void setHtmlPath(String htmlPath) {
        this.htmlPath = htmlPath;
    }
}


索引子系统的实现

package com.rain.index;

import java.io.File;
import java.io.IOException;
import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.document.Field;

import com.rain.util.HTMLDocParser;

public class IndexManager {
 
 //the directory that stores HTML files
 private final String dataDir="E:\\dataDir";
 
 //the directory that is used to store a Lucene index
 private final String indexDir="E:\\indexDir";
 
 public boolean creatIndex()throws IOException{
  if(true==inIndexExist()){
   return true;
  }
  File dir=new File(dataDir);
  if(!dir.exists()){
   return false;
  }
  File[] htmls=dir.listFiles();
  Directory fsDirectory=FSDirectory.getDirectory(indexDir,true);
  Analyzer analyzer=new StandardAnalyzer();
  IndexWriter indexWriter=new IndexWriter(fsDirectory,analyzer,true);
  for(int i=0;i<htmls.length;i++){
   String htmlPath=htmls[i].getAbsolutePath();
   if(htmlPath.endsWith(".html")||htmlPath.endsWith("htm")){
    addDocument(htmlPath,indexWriter);
   }
  }
  indexWriter.optimize();
  indexWriter.close();
  return true;
 }
 
 public void addDocument(String htmlPath,IndexWriter indexWriter){
  HTMLDocParser htmlParser=new HTMLDocParser(htmlPath);
  String path=htmlParser.getPath();
  String title=htmlParser.getTitle();
  Reader content=htmlParser.getContent();
  
  Document document=new Document();
  document.add(new Field("path",path,Field.Store.YES,Field.Index.NO));
  document.add(new Field("title",title,Field.Store.YES,Field.Index.TOKENIZED));
     document.add(new Field("content",content));
     try{
      indexWriter.addDocument(document);
     }catch(IOException e){
      e.printStackTrace();
     }
 }
 public String getDataDir(){
  return this.dataDir;
 }
 
 public String getIndexDir(){
  return this.indexDir;
 }
 
 public boolean inIndexExist(){
  File directory=new File(indexDir);
  if(0<directory.listFiles().length){
   return true;
  }else{
   return false;
  }
 }
}


搜索功能的实现
package com.rain.search;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;

import com.rain.index.IndexManager;

public class SearchManager {
 private String searchWord;
 private IndexManager indexManager;
 private Analyzer analyzer;
 
 public SearchManager(String searchWord){
  this.searchWord=searchWord;
  this.indexManager=new IndexManager();
  this.analyzer=new StandardAnalyzer();
 }
 
 /**
     * do search
     */
 public List search(){
  List searchResult=new ArrayList();
  if(false==indexManager.inIndexExist()){
   try{
    if(false==indexManager.creatIndex()){
     return searchResult;
    }
   }catch(IOException e){
    e.printStackTrace();
    return searchResult;
   }
  }
  IndexSearcher indexSearcher=null;
  try{
   indexSearcher=new IndexSearcher(indexManager.getIndexDir());
  }catch(IOException e){
   e.printStackTrace();
  }
  QueryParser queryParser=new QueryParser("content",analyzer);
  Query query=null;
  try{
   query=queryParser.parse(searchWord);
  }catch(ParseException e){
   e.printStackTrace();
  }
  if(null!=query&&null!=indexSearcher){
   try{
    Hits hits=indexSearcher.search(query);
    for(int i=0;i<hits.length();i++){
     SearchResultBean resultBean=new SearchResultBean();
     resultBean.setHtmlPath(hits.doc(i).get("path"));
     resultBean.setHtmlTitle(hits.doc(i).get("title"));
     searchResult.add(resultBean);
    }
   }catch(IOException e){
    e.printStackTrace();
   }
  }
   return searchResult;
 }

}


请求管理器的实现

package com.rain.servlet;

import java.io.IOException;
import java.util.List;

import javax.servlet.RequestDispatcher;
import javax.servlet.ServletException;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;

import com.rain.search.SearchManager;

/**
 * @author zhourui
 * 2007-1-28
 */
public class SearchController extends HttpServlet {
 private static final long serialVersionUID=1L;
 
 /* (non-Javadoc)
  * @see javax.servlet.http.HttpServlet#doPost(javax.servlet.http.HttpServletRequest, javax.servlet.http.HttpServletResponse)
  */
 @Override
 protected void doPost(HttpServletRequest arg0, HttpServletResponse arg1) throws ServletException, IOException {
  // TODO Auto-generated method stub
  String searchWord=arg0.getParameter("searchWord");
  SearchManager searchManager=new SearchManager(searchWord);
  List searchResult=null;
  searchResult=searchManager.search();
  RequestDispatcher dispatcher=arg0.getRequestDispatcher("search.jsp");
  arg0.setAttribute("searchResult",searchResult);
        dispatcher.forward(arg0, arg1);
 }
 
}




向Web服务器提交搜索请求
<!-- Search form: POSTs the text field "searchWord" to the SearchController servlet. -->
<form action="SearchController" method="post">
      <table>
        <tr>
          <td colspan="3">
            SearchWord:<input type="text" name="searchWord" id="searchWord" size="40">
            <input id="doSearch" type="submit" value="search">
          </td>
        </tr>
      </table>
    </form>
显示搜索结果
 <table class="result">
      <%
        List searchResult=(List)request.getAttribute("searchResult");
        int resultCount=0;
        if(null!=searchResult){
         resultCount=searchResult.size();
        }
        for(int i=0;i<resultCount;i++){
         SearchResultBean resultBean=(SearchResultBean)searchResult.get(i);
         String title=resultBean.getHtmlTitle();
         String path=resultBean.getHtmlPath();
         %>
         <tr>
           <td class="title"><h3><a href="<%=path%>"><%=title%></a></h3></td>
         </tr>
         <%
        }
      %>
    </table>
posted on 2007-01-29 09:57 周锐 阅读(830) 评论(0)  编辑  收藏 所属分类: Lucene

只有注册用户登录后才能发表评论。


网站导航: