2008年4月4日随笔档案 - 菜鸟上路，多多指教

索引读写器_FSDirectory模式

package indexer;
//package ch2.lucenedemo.process;

import java.io.File;

import java.io.IOException;
import java.util.ArrayList;
import java.util.logging.Level;
import java.util.logging.Logger;
import jeasy.analysis.MMAnalyzer;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import parameters.Param;
import pretreat.FileControl;

/**
 * Builds a Lucene index, stored on the file system through {@link FSDirectory},
 * for the regular files found directly inside a root directory and inside every
 * sub-directory below it.
 *
 * <p>The index lives at {@code Param.INDEX_STORE_PATH}. All text is analyzed with
 * {@code MMAnalyzer}. Each indexed file becomes one {@link Document} with the
 * fields {@code filename}, {@code content} and {@code filepath}.
 *
 * <p>Every writer is opened, used and closed inside a single method call, so no
 * {@link IndexWriter} is kept open between calls.
 */
public class IndexOnFS implements IIndexTool{
    private static final Logger LOGGER = Logger.getLogger(IndexOnFS.class.getName());

    // Location where the generated index files are stored.
    private final String indexStorePath = Param.INDEX_STORE_PATH;
    // Root directory whose files are to be indexed.
    private final String indexWantedPath;
    // Every sub-directory found (recursively) below the root directory.
    private final ArrayList<String> dirList = new ArrayList<String>();
    // Lucene directory backing the index; null when it could not be opened.
    private final Directory dir;

    /**
     * Opens the index directory, makes sure an initial index exists and collects
     * all sub-directories of {@code path}.
     *
     * <p>A failure to open the index directory is logged, not thrown; later
     * indexing calls will then fail and log their own errors.
     *
     * @param path root directory whose files are to be indexed
     */
    public IndexOnFS(String path){
        Directory opened = null;
        try {
            opened = FSDirectory.getDirectory(indexStorePath);
        } catch (IOException ex) {
            LOGGER.log(Level.SEVERE, "Cannot open index directory " + indexStorePath, ex);
        }
        dir = opened;
        indexWantedPath = path;
        makeSegments();
        searchDirectorys(path);
    }

    /**
     * Recursively collects every sub-directory of {@code rootDir} into the
     * internal directory list. The root itself is not added.
     *
     * <p>Original author's note: the directories are gathered before indexing
     * starts in order to accommodate IndexWriter's synchronization mechanism.
     *
     * @param rootDir directory to scan; silently ignored when it cannot be listed
     */
    public void searchDirectorys(String rootDir){
        File[] files = new File(rootDir).listFiles();
        if (files == null) {
            return; // not a directory, or it could not be read
        }
        for (int i = 0; i < files.length; i++) {
            if (files[i].isDirectory()) {
                dirList.add(files[i].getPath());
                searchDirectorys(files[i].getPath());
            }
        }
    }

    /** Prints every collected sub-directory path to standard output, one per line. */
    public void printAllDirectorys(){
        for (int i = 0; i < dirList.size(); i++) {
            System.out.println(dirList.get(i));
        }
    }

    /** Indexes the root directory first and then every collected sub-directory. */
    public void createIndexs() {
        createIndex(indexWantedPath);
        for (int k = 0; k < dirList.size(); k++) {
            createIndex(dirList.get(k));
        }
    }

    /**
     * Converts a file into the Lucene document that represents it.
     *
     * @param file file to describe
     * @return a document with a stored, tokenized {@code filename}; a tokenized but
     *         not stored {@code content} (text obtained from
     *         {@code FileControl.fileToString}); and a stored, tokenized
     *         {@code filepath}
     */
    public Document preIndexWrite(File file){
        Document doc = new Document();
        // Field holding the file name.
        doc.add(new Field("filename", file.getName(),
                          Field.Store.YES, Field.Index.TOKENIZED));
        // Field holding the file content; extraction is delegated to FileControl.
        doc.add(new Field("content", FileControl.fileToString(file),
                          Field.Store.NO, Field.Index.TOKENIZED));
        // Field holding the file path.
        doc.add(new Field("filepath", file.getPath(),
                          Field.Store.YES, Field.Index.TOKENIZED));
        return doc;
    }

    /**
     * Indexes the regular files located directly inside {@code inputDir}
     * (sub-directories are not descended into), then optimizes the index.
     *
     * <p>Any failure is logged and ends processing of this directory. The writer
     * is always closed.
     *
     * @param inputDir directory whose files are added to the index
     */
    public void createIndex(String inputDir) {
        // A local writer is used so that a failed open can never lead to closing
        // a stale writer left over from an earlier call.
        IndexWriter indexWriter = null;
        try {
            // create == false: append to the existing index (true is only used
            // when the index is created for the first time).
            indexWriter = new IndexWriter(dir, new MMAnalyzer(), false);
            File[] files = new File(inputDir).listFiles();
            if (files != null) {
                for (int i = 0; i < files.length; i++) {
                    if (files[i].isFile()) {
                        indexWriter.addDocument(preIndexWrite(files[i]));
                        System.out.println(files[i].getPath());
                    }
                }
            }
            indexWriter.optimize();
        } catch (Exception e) {
            LOGGER.log(Level.SEVERE, "Failed to index directory " + inputDir, e);
        } finally {
            closeWriter(indexWriter);
        }
    }

    /**
     * Initializes an empty index store: when the index directory contains no
     * entries (or cannot be listed), a new index is created and seeded with the
     * document built from {@code Param.INITFILE_PATH}. Does nothing when the
     * directory already has content.
     */
    public void makeSegments(){
        String[] existing = new File(indexStorePath).list();
        if (existing != null && existing.length != 0) {
            return;
        }
        IndexWriter indexWriter = null;
        try {
            indexWriter = new IndexWriter(dir, new MMAnalyzer(), true);
            indexWriter.addDocument(preIndexWrite(new File(Param.INITFILE_PATH)));
        } catch (Exception ex) {
            LOGGER.log(Level.SEVERE, "Failed to initialize index in " + indexStorePath, ex);
        } finally {
            closeWriter(indexWriter);
        }
    }

    /**
     * Returns the live list of sub-directories collected so far.
     *
     * @return the internal list of directory paths (not a copy)
     */
    public ArrayList getDirs(){
        // NOTE(review): raw return type kept so existing callers and the
        // IIndexTool declaration (not visible here) keep compiling.
        return this.dirList;
    }

    /** Makes sure the index exists, then indexes the root and all sub-directories. */
    public void startIndex() {
        makeSegments();
        createIndexs();
    }

    /** Closes {@code indexWriter} if it was opened, logging any failure. */
    private static void closeWriter(IndexWriter indexWriter) {
        if (indexWriter == null) {
            return;
        }
        try {
            indexWriter.close();
        } catch (Exception ex) {
            LOGGER.log(Level.SEVERE, "Failed to close IndexWriter", ex);
        }
    }

    public static void main(String[] args) {
        IndexOnFS processor = new IndexOnFS("e:\\毕业论文");
        processor.startIndex();
    }

}

posted @ 2008-06-01 05:11 HanLab 阅读(248) | 评论 (0) | 编辑收藏

Lucene源码结构

这是我做毕业设计时画的，感觉还可以就放过来共享，有什么不足的地方，请多指点。

看Lucene代码也算是种享受，根据下图可以先看看关键类的代码。
Lucene2.3.1发布不久，源代码下载地址：http://apache.mirror.phpchina.com/lucene/java/

org.apache.lucene.search/	搜索入口
org.apache.lucene.index/	索引入口
org.apache.lucene.analysis/	语言分析器
org.apache.lucene.queryParser/	查询分析器
org.apache.lucene.document/	存储结构
org.apache.lucene.store/	底层IO/存储结构
org.apache.lucene.util/	一些公用的数据结构

posted @ 2008-04-06 17:02 HanLab 阅读(429) | 评论 (0) | 编辑收藏

JDBC批量插入数据(批量插入指定目录下的文件信息)

/*首先利用PL/SQL创建一个过程

CREATE OR REPLACE PROCEDURE INS_FILES(P1 IN VARCHAR2,P2 IN VARCHAR2,P3 IN number)
AS
BEGIN
INSERT INTO tb_files(fid,fname,fpath,indexed) VALUES (SEQ_fid.nextval,P1,P2,P3);
END INS_FILES;*/

/**
 * Walks {@code path} recursively and, for every regular file found, binds the
 * file name, the file path and the flag value 0 to the prepared callable
 * statement on {@code conn}, executes it, and increments {@code count}.
 *
 * <p>Any exception raised while processing a directory is printed and ends the
 * processing of that directory; directories already handled are unaffected.
 *
 * @param path directory to scan; reported on standard error and skipped when
 *             it is not a directory or cannot be listed
 */
public static void recursion(String path){
    File[] files = new File(path).listFiles();
    if (files == null) {
        // listFiles() returns null for a non-directory or on an I/O error.
        System.err.println("Cannot list directory: " + path);
        return;
    }
    try {
        for (int i = 0; i < files.length; i++) {
            if (files[i].isFile()) {
                conn.callablestatement.setString(1, files[i].getName());
                conn.callablestatement.setString(2, files[i].getPath());
                conn.callablestatement.setInt(3, 0);
                conn.executeCall();
                count++;
            } else if (files[i].isDirectory()) {
                // Descend into sub-directories.
                recursion(files[i].getAbsolutePath());
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
}
    /**
     * Stores an entry for every file below {@code rootpath} in the database:
     * opens a connection, prepares the {@code SQL.call_ins_files} call, walks
     * the tree with {@link #recursion(String)}, and finally prints the value of
     * {@code count}.
     *
     * <p>The connection is closed even when preparing the call or walking the
     * tree throws.
     *
     * @param rootpath root directory whose files are recorded
     */
    public void storeFilesToDB(String rootpath){
       conn = new JDBCConnection();
       try {
           conn.setCallableStatement(SQL.call_ins_files);
           recursion(rootpath);
       } finally {
           conn.close();
       }
       // NOTE(review): count is not reset here, so it presumably accumulates
       // across calls - confirm against its declaration.
       System.out.println("共有"+count+"个文件.");
    }

posted @ 2008-04-04 06:36 HanLab 阅读(910) | 评论 (0) | 编辑收藏

Oracle批量插入数据用Varray，效率很低。

/**
 * Builds an anonymous PL/SQL block that inserts directory paths into
 * {@code tb_allpath}.
 *
 * <p>The directories are collected from {@code e:\} by an
 * {@code IndexProcesser}. Its first 2000 {@code Directorys} entries are
 * embedded as string literals in a {@code varray(2000)}. The generated block
 * loops over the varray and inserts every entry whose text is not
 * {@code 'null'} (the text an unset Java slot produces), then commits.
 *
 * <p>The block takes its ids from {@code SEQ_PID.NEXTVAL}; this method does not
 * emit any statement that creates the sequence.
 *
 * @return the PL/SQL source text
 */
public static String Procedure_Ins_path(){
    // Number of varray slots; the generated PL/SQL uses the same value.
    // NOTE(review): assumes p.Directorys holds at least this many slots - confirm.
    final int capacity = 2000;

    String dir = "e:\\";
    IndexProcesser p = new IndexProcesser();
    p.searchDirectorys(dir,true);

    StringBuilder sql = new StringBuilder();
    sql.append("declare ")
       .append("type path_varray is varray(").append(capacity).append(") of varchar2(200); ")
       .append("p_v path_varray:=path_varray (");

    for (int i = 0; i < capacity; i++) {
        if (i > 0) {
            sql.append(',');
        }
        // Double any single quote so a path containing ' cannot terminate the
        // PL/SQL string literal early.
        String literal = String.valueOf(p.Directorys[i]).replace("'", "''");
        sql.append('\'').append(literal).append('\'');
    }

    sql.append("); begin ")
       .append("for i in 1..").append(capacity).append(" loop ")
       .append("if p_v(i) = 'null' then ")
       .append("p_v(i):='null'; ")
       .append("else ")
       .append("INSERT INTO tb_allpath (ID ,path) VALUES (SEQ_PID.NEXTVAL,p_v(i));")
       .append("end if; ")
       .append("end loop;")
       .append("commit;")
       .append("end;");
    return sql.toString();
}

posted @ 2008-04-04 06:30 HanLab 阅读(921) | 评论 (0) | 编辑收藏

菜鸟上路，多多指教

导航

常用链接

留言簿(1)

随笔分类

随笔档案

搜索

最新评论

阅读排行榜

评论排行榜

索引读写器_FSDirectory模式

Lucene源码结构

JDBC批量插入数据(批量插入指定目录下的文件信息)

Oracle批量插入数据用Varray，效率很低。