梦幻e家人

java咖啡
随笔 - 15, 文章 - 0, 评论 - 11, 引用 - 0
数据加载中……

全文检索

package searchfileexample;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.IndexWriter;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.Date;
import org.apache.lucene.demo.FileDocument;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import java.io.FileReader;
import org.apache.lucene.index.*;
import java.text.DateFormat;
import org.apache.poi.hdf.extractor.WordDocument;
import java.io.InputStream;
import java.io.StringWriter;
import java.io.PrintWriter;
import java.io.FileInputStream;
import java.io.*;
import org.textmining.text.extraction.WordExtractor;

/**
 * 给某个目录下的所有文件生成索引
 * <p>Title: </p>
 * <p>Description: </p>
 * <p>Copyright: Copyright (c) 2007</p>
 * <p>Company: </p>
 * @author not attributable
 * @version 1.0
 * 根据文件的不同,可以把索引文件创建到不同的文件夹下去,这样可以分类保存索引信息。
 */

/** Index all text files under a directory. */
public class IndexFiles {

  private IndexFiles() {}

  static final File INDEX_DIR = new File("index");

  /** Index all text files under a directory. */
  public static void main(String[] args) {
    String usage = "java org.apache.lucene.demo.IndexFiles <root_directory>";
    //String[] arg = {"a","b"};
    //System.out.println(arg[0]);
    /*
         if (args.length == 0) {
      System.err.println("Usage: " + usage);
      System.exit(1);
         }*/
    /*
        if (INDEX_DIR.exists()) {
          System.out.println("Cannot save index to '" +INDEX_DIR+ "' directory, please delete it first");
          System.exit(1);
        }*/

    final File docDir = new File("a"); //需要生成索引的文件的文件夹
    if (!docDir.exists() || !docDir.canRead()) {
      System.out.println("Document directory '" + docDir.getAbsolutePath() +
                         "' does not exist or is not readable, please check the path");
      System.exit(1);
    }

    Date start = new Date();
    try {
      IndexWriter writer = new IndexWriter(INDEX_DIR, new StandardAnalyzer(), true); //true-覆盖原有的索引 false-不覆盖原有的索引
      System.out.println("Indexing to directory '" + INDEX_DIR + "'...");
      indexDocs(writer, docDir);
      System.out.println("Optimizing...");
      writer.optimize();
      writer.close();

      Date end = new Date();
      System.out.println(end.getTime() - start.getTime() +
                         " total milliseconds");

    }
    catch (IOException e) {
      System.out.println(" caught a " + e.getClass() +
                         "\n with message: " + e.getMessage());
    }
  }

  static void indexDocs(IndexWriter writer, File file) throws IOException {
    // do not try to index files that cannot be read
    if (file.canRead()) {
      if (file.isDirectory()) {
        String[] files = file.list();
        // an IO error could occur
        if (files != null) {
          for (int i = 0; i < files.length; i++) {
            indexDocs(writer, new File(file, files[i]));
          }
        }
      }
      else {
        System.out.println("adding " + file);
        try {

          writer.addDocument(getDocument2(file, new FileInputStream(file)));
          //writer.addDocument(parseFile(file));

          //writer.addDocument(FileDocument.Document(file));//path 存放文件的相对路径
        }
        // at least on windows, some temporary files raise this exception with an "access denied" message
        // checking if the file can be read doesn't help
        catch (Exception fnfe) {
          ;
        }
      }
    }
  }

  /**
   *@paramfile
   *
   *把File变成Document
   */
  static Document parseFile(File file) throws Exception {
    Document doc = new Document();
    doc.add(new Field("path", file.getAbsolutePath(), Field.Store.YES,
                      Field.Index.UN_TOKENIZED)); //取文件的绝对路径
    try {
      doc.add(new Field("contents", new FileReader(file))); //索引文件内容
      doc.add(new Field("title", file.getName(), Field.Store.YES,
                        Field.Index.UN_TOKENIZED));
      //索引最后修改时间
      doc.add(new Field("modified",
                        String.valueOf(DateFormat.
                                       getDateTimeInstance().format(new
          Date(file.lastModified()))), Field.Store.YES,
                        Field.Index.UN_TOKENIZED));
      //doc.removeField("title");
    }
    catch (Exception e) {
      e.printStackTrace();
    }
    return doc;
  }

  /**
   *@paramfile
   *
   *转换word文档

         static String changeWord(File file) throws Exception {
    String re = "";
    try {
      WordDocument wd = new WordDocument(is);
        StringWriter docTextWriter = new StringWriter();
        wd.writeAllText(new PrintWriter(docTextWriter));
        docTextWriter.close();
        bodyText = docTextWriter.toString();

    } catch (Exception e) {
        e.printStackTrace();
    }
    return re;
         }*/
  /**
   *@paramfile
   *
   *使用POI读取word文档
   */
  static Document getDocument(File file, FileInputStream is) throws Exception {

    String bodyText = null;

    try {

      //BufferedReader wt = new BufferedReader(new InputStreamReader(is));
      //bodyText = wt.readLine();
      //System.out.println("word ===="+bodyText);

      WordDocument wd = new WordDocument(is);
      StringWriter docTextWriter = new StringWriter();
      wd.writeAllText(new PrintWriter(docTextWriter));
      bodyText = docTextWriter.toString();
      docTextWriter.close();
      //   bodyText   =   new   WordExtractor().extractText(is);
      System.out.println("word content====" + bodyText);
    }
    catch (Exception e) {
      ;

    }

    if ( (bodyText != null)) {
      Document doc = new Document();
      doc.add(new Field("path", file.getAbsolutePath(), Field.Store.YES,
                        Field.Index.UN_TOKENIZED)); //取文件的绝对路径
      doc.add(new Field("contents", bodyText, Field.Store.YES,
                        Field.Index.TOKENIZED));

      return doc;
    }
    return null;
  }

  //Document   doc   =   getDocument(new   FileInputStream(new   File(file)));
  /**
   *@paramfile
   *
   *使用tm-extractors-0.4.jar读取word文档
   */
  static Document getDocument2(File file, FileInputStream is) throws Exception {

    String bodyText = null;

    try {

      //FileInputStream in = new FileInputStream("D:/lfy_programe/全文检索/SearchFileExample/a/aa.doc");
      //  FileInputStream in = new FileInputStream ("D:/szqxjzhbase/技术测试/新建 Microsoft Word 文档.doc");
      WordExtractor extractor = new WordExtractor();
      System.out.println(is.available());

      bodyText = extractor.extractText(is);

//    System.out.println("the result length is"+str.length());
      System.out.println("word content===="+bodyText);

    }
    catch (Exception e) {
      ;

    }

    if ( (bodyText != null)) {
      Document doc = new Document();
      doc.add(new Field("path", file.getAbsolutePath(), Field.Store.YES,
                        Field.Index.UN_TOKENIZED)); //取文件的绝对路径
      doc.add(new Field("contents", bodyText, Field.Store.YES,
                        Field.Index.TOKENIZED));

      return doc;
    }
    return null;
  }

}


 

package searchfileexample;


import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.FilterIndexReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Searcher;


import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.Date;
import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.analysis.KeywordAnalyzer;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.document.Fieldable;

/** Simple command-line based search demo. */
public class SearchFiles {

  /** Use the norms from one field for all fields.  Norms are read into memory,
   * using a byte of memory per document per searched field.  This can cause
   * search of large collections with a large number of fields to run out of
   * memory.  If all of the fields contain only a single token, then the norms
   * are all identical, then single norm vector may be shared. */
  private static class OneNormsReader extends FilterIndexReader {
    private String field;

    public OneNormsReader(IndexReader in, String field) {
      super(in);
      this.field = field;
    }

    public byte[] norms(String field) throws IOException {
      return in.norms(this.field);
    }
  }

  private SearchFiles() {}

  /** Simple command-line based search demo. */
  public static void main(String[] arg) throws Exception {
    String[] args = {"a","b"};
    String usage =
      "Usage: java org.apache.lucene.demo.SearchFiles [-index dir] [-field f] [-repeat n] [-queries file] [-raw] [-norms field]";
    if (args.length > 0 && ("-h".equals(args[0]) || "-help".equals(args[0]))) {
      System.out.println(usage);
      System.exit(0);
    }

    String index = "index";//该值是用来存放生成的索引文件的文件夹的名称,不能改动
    String field = "contents";//不能修改  field  的值
    String queries = null;//是用来存放需要检索的关键字的一个文件。
    queries = "D:/lfy_programe/全文检索/SearchFileExample/aa.txt";

    int repeat = 1;
    boolean raw = false;
    String normsField = null;

    for (int i = 0; i < args.length; i++) {
      if ("-index".equals(args[i])) {
        index = args[i+1];
        i++;
      } else if ("-field".equals(args[i])) {
        field = args[i+1];
        i++;
      } else if ("-queries".equals(args[i])) {
        queries = args[i+1];
        i++;
      } else if ("-repeat".equals(args[i])) {
        repeat = Integer.parseInt(args[i+1]);
        i++;
      } else if ("-raw".equals(args[i])) {
        raw = true;
      } else if ("-norms".equals(args[i])) {
        normsField = args[i+1];
        i++;
      }
    }

    IndexReader reader = IndexReader.open(index);

    if (normsField != null)
      reader = new OneNormsReader(reader, normsField);

    Searcher searcher = new IndexSearcher(reader);//用来打开索引文件
    Analyzer analyzer = new StandardAnalyzer();//分析器
    //Analyzer analyzer = new StandardAnalyzer();

    BufferedReader in = null;
    if (queries != null) {
      in = new BufferedReader(new FileReader(queries));
    } else {
      in = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
    }
      QueryParser parser = new QueryParser(field, analyzer);
    while (true) {
      if (queries == null)                        // prompt the user
        System.out.println("Enter query: ");

      String line = in.readLine();//组成查询关键字字符串
      System.out.println("查询字符串==="+line);

      if (line == null || line.length() == -1)
        break;

      line = line.trim();
      if (line.length() == 0)
        break;

      Query query = parser.parse(line);
      System.out.println("Searching for: " + query.toString(field));//每个关键字

      Hits hits = searcher.search(query);

      if (repeat > 0) {                           // repeat & time as benchmark
        Date start = new Date();
        for (int i = 0; i < repeat; i++) {
          hits = searcher.search(query);
        }
        Date end = new Date();
        System.out.println("Time: "+(end.getTime()-start.getTime())+"ms");
      }

      System.out.println("查询到:" + hits.length() + " 个含有 ["+query.toString(field)+"]的文档");

      final int HITS_PER_PAGE = 10;//查询返回的最大记录数
      int currentNum = 2;//当前记录数
      for (int start = 0; start < hits.length(); start += HITS_PER_PAGE) {
        //start = start + currentNum;
        int end = Math.min(hits.length(), start + HITS_PER_PAGE);
        for (int i = start; i < end; i++) {

          //if (raw) {                              // output raw format
            System.out.println("doc="+hits.id(i)+" score="+hits.score(i));//score是接近度的意思
            //continue;
          //}

          Document doc = hits.doc(i);
          String path = doc.get("path");


          if (path != null) {
            System.out.println((i+1) + ". " + path);
            String title = doc.get("title");
            System.out.println("   modified: " + doc.get("modified"));
            if (title != null) {
              System.out.println("   Title: " + doc.get("title"));
            }
          } else {
            System.out.println((i+1) + ". " + "No path for this document");
          }
        }

        if (queries != null)                      // non-interactive
          break;

        if (hits.length() > end) {
          System.out.println("more (y/n) ? ");
          line = in.readLine();
          if (line.length() == 0 || line.charAt(0) == 'n')
            break;
        }
      }
    }
    reader.close();
  }
}


 

package searchfileexample;

import javax.servlet.*;
import javax.servlet.http.*;
import java.io.*;
import java.util.*;
import org.textmining.text.extraction.WordExtractor;

public class ReadWord extends HttpServlet {
  private static final String CONTENT_TYPE = "text/html; charset=GBK";

  //Initialize global variables
  public void init() throws ServletException {
  }

  //Process the HTTP Get request
  public void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
    response.setContentType(CONTENT_TYPE);
    FileInputStream in = new FileInputStream ("D:/lfy_programe/全文检索/SearchFileExample/a/aa.doc");
       //  FileInputStream in = new FileInputStream ("D:/szqxjzhbase/技术测试/新建 Microsoft Word 文档.doc");
   WordExtractor extractor = new WordExtractor();
   System.out.println(in.available());
  String str = null;
  try {
    str = extractor.extractText(in);
  }
  catch (Exception ex) {
  }
//    System.out.println("the result length is"+str.length());
   System.out.println(str);

  }

  //Clean up resources
  public void destroy() {
  }
}

1.英文的模糊(通配符)查询问题
查询时在关键字的后面加上通配符 "*" 即可,例如用 luc* 可以匹配 lucene。

2.IndexFiles.java
用来索引文件的java类

3.SearchFiles.java
用来搜索的java类

4.ReadWord.java
使用tm-extractors-0.4.jar来读取word文件


 

 

posted on 2008-03-18 10:35 轩辕 阅读(259) 评论(0)  编辑  收藏 所属分类: java


只有注册用户登录后才能发表评论。


网站导航: