posts - 8, comments - 0, trackbacks - 0, articles - 0

2008年4月4日

 

package indexer;
//package ch2.lucenedemo.process;

import java.io.File;

import java.io.IOException;
import java.util.ArrayList;
import java.util.logging.Level;
import java.util.logging.Logger;
import jeasy.analysis.MMAnalyzer;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import parameters.Param;
import pretreat.FileControl;

public class IndexOnFS implements IIndexTool{
    
// 成员变量存储创建的索引文件存放的位置
    private String INDEX_STORE_PATH = Param.INDEX_STORE_PATH;
        
//建立索引的目标文件
        private String INDEX_WANTED_PATH = "e:\\";
        
//目录数组总数
        private int NumOfDir = 0;
        
//存放根目录下的所有子目录
        private ArrayList<String> DirList = new ArrayList<String>();
        
//地址映射
        private Directory dir = null;
        
        
private IndexWriter writer;
        
        
public IndexOnFS(String path){
                
try {
                    dir 
= FSDirectory.getDirectory(INDEX_STORE_PATH);
                } 
catch (IOException ex) {
                    Logger.getLogger(IndexOnFS.
class.getName()).log(Level.SEVERE, null, ex);
                }
                INDEX_WANTED_PATH 
= path;
                makeSegments();
                searchDirectorys(path);
        }

        
//建立索引之前遍历所有目录并存放,这是为了迎合IndexWriter的同步机制
        public void searchDirectorys(String rootDir){
        
                File rootfile 
= new File(rootDir);
                File[] files 
= rootfile.listFiles();
                
if(files!=null)
                
for (int i = 0; i < files.length; i++){
                    
if(files[i].isDirectory()){
                       DirList.add(files[i].getPath());
                       searchDirectorys(files[i].getPath()); 
                    }
                }
        }
        
public void printAllDirectorys(){
                
for(int i = 0;i<DirList.size();i++)
                       System.out.println(DirList.get(i));
        }
        
public void createIndexs() {
                createIndex(INDEX_WANTED_PATH);
                
for(int k = 0;k<DirList.size();k++)
                    createIndex(DirList.get(k));
        }
        
public Document preIndexWrite(File file){
                
// 创建一个新的Document
            Document doc = new Document();
            
// 文件名对应的Field
            Field field = new Field("filename", file.getName(), 
                                 Field.Store.YES, Field.Index.TOKENIZED); 
        doc.add(field);
        
// 文件内容对应的Filed
        field = new Field("content", FileControl.fileToString(file),//转到控制器
                   Field.Store.NO, Field.Index.TOKENIZED);
        doc.add(field);
                
//文件路径对应的Filed
                field = new Field("filepath", file.getPath(), 
                           Field.Store.YES, Field.Index.TOKENIZED); 
        doc.add(field);
                
                
return doc;
        }
    
/*单目录创建索引*/
    
public void createIndex(String inputDir) {
        
try {
            
/*MMAnalyzer作为分词工具创建一个IndexWriter*/
                writer 
= new IndexWriter(dir,new MMAnalyzer(), false); /*第一次创建索引时为true*/
            File filesDir 
= new File(inputDir);
            
/*取得所有需要建立索引的文件数组*/
            File[] files 
= filesDir.listFiles();
            
/*遍历数组*/
                        
if(files!=null)
            
for (int i = 0; i < files.length; i++) { 
                              
/*判断是否为文件*/
                              
if(files[i].isFile()){ 
                    
/*把Document加入IndexWriter*/
                    writer.addDocument(preIndexWrite(files[i]));  
                                        System.out.println( files[i].getPath());
                        }
                                    } 
                        writer.optimize(); 
/*索引优化*/

        } 
catch (Exception e) { e.printStackTrace(); }
                
                
finally
                    
try{writer.close();
                    }
catch(Exception ee){ ee.printStackTrace(); }
                }

    }
        
//初始化空索引库
        public void makeSegments(){
            
if(new File(INDEX_STORE_PATH).list().length==0){
                
try {
                    IndexWriter iw 
= new IndexWriter(dir, new MMAnalyzer(), true);
                    writer.addDocument(preIndexWrite(
new File(Param.INITFILE_PATH)));
                } 
catch (Exception ex) { ex.printStackTrace(); } 
                
                
finally
                      
try{writer.close();
                      }
catch(Exception ee){ ee.printStackTrace(); }
                  }
            }
        }
        
public ArrayList getDirs(){
            
return this.DirList;
        }
        
        
public void startIndex() {
            makeSegments();
            createIndexs();
        }

    
public static void main(String[] args) {
        IndexOnFS processor 
= new IndexOnFS("e:\\毕业论文");
                
//processor.searchDirectorys("e:\\1");
                processor.startIndex();
    }


}

posted @ 2008-06-01 05:11 HanLab 阅读(220) | 评论 (0)编辑 收藏

这是我做毕业设计时画的,感觉还可以就放过来共享,有什么不足的地方,请多指点。

看Lucene代码也算是一种享受,根据下图可以先看看关键类的代码。
Lucene2.3.1发布不久,源代码下载地址:http://apache.mirror.phpchina.com/lucene/java/


 

 

 org.apache.lucene.search/

 搜索入口

 org.apache.lucene.index/

 索引入口

 org.apache.lucene.analysis/

 语言分析器

 org.apache.lucene.queryParser/

 查询分析器

 org.apache.lucene.document/

 存储结构

 org.apache.lucene.store/

 底层IO/存储结构

 org.apache.lucene.util/

 一些公用的数据结构


 

 

posted @ 2008-04-06 17:02 HanLab 阅读(404) | 评论 (0)编辑 收藏

   /*首先利用PL/SQL创建一个过程

CREATE OR REPLACE PROCEDURE INS_FILES(P1 IN VARCHAR2,P2 IN VARCHAR2,P3 IN number)
AS
BEGIN
     INSERT INTO tb_files(fid,fname,fpath,indexed) VALUES (SEQ_fid.nextval,P1,P2,P3);
END INS_FILES;*/


 

 public static void recursion(String path){
  
        File file=new File(path);
        File[] files=file.listFiles();
        try{
              for(int i=0;i<files.length;i++)
                { 
                   if(files[i].isFile())
                         {   
                              conn.callablestatement.setString(1,files[i].getName().toString());
                              conn.callablestatement.setString(2,files[i].getPath().toString());
                              conn.callablestatement.setInt(3,0);
                              conn.executeCall();
                              count++;
                          }
                   else if(files[i].isDirectory())
                             recursion(files[i].getAbsolutePath());//对于目录进一步检索
                  
                 }
             }catch(Exception e){ e.printStackTrace(); }
      }
    public void storeFilesToDB(String rootpath){
       conn = new JDBCConnection();
       conn.setCallableStatement(SQL.call_ins_files);
       recursion(rootpath);
       conn.close();
       System.out.println("共有"+count+"个文件.");
    }

posted @ 2008-04-04 06:36 HanLab 阅读(884) | 评论 (0)编辑 收藏

public static String Procedure_Ins_path(){
    String dir = "e:\\";
    IndexProcesser p;
    p = new IndexProcesser();
    p.searchDirectorys(dir,true);
    String s0 ="DROP SEQUENCE  SEQ_pid;" +
               " CREATE SEQUENCE  SEQ_pid INCREMENT BY 1 START WITH 1 NOCYCLE; ";
    String s1 = "declare " +
               "type path_varray is varray(2000) of varchar2(200); " +
               "p_v path_varray:=path_varray (";
    String s3 =  "); begin " +
                      "for i in 1..2000 loop " +
                         "if p_v(i) = 'null' then " +
                             "p_v(i):='null'; " +
                         "else " +
                             "INSERT INTO tb_allpath (ID ,path) VALUES (SEQ_PID.NEXTVAL,p_v(i));"+
                         "end if; " +
                       "end loop;" +
                    "commit;" +
                    "end;";
    String s2 ="";
           for(int i = 0;i<1999;i++)
               s2 = s2+"'"+p.Directorys[i]+"',";
           s2 = s2 +"'"+ p.Directorys[1999]+"'";
           return s1+s2+s3;
}

posted @ 2008-04-04 06:30 HanLab 阅读(896) | 评论 (0)编辑 收藏