比较常用的几种英文分析器,它们之间的区别见程序中的注释。
SimpleAnalyzer
StandardAnalyzer
WhitespaceAnalyzer
StopAnalyzer
package analyzer;

import java.io.Reader;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;


/**
 * Small demo that feeds the same input string through four Lucene analyzers
 * (Whitespace, Simple, Stop, Standard) and prints the tokens each one emits,
 * so their tokenization behavior can be compared side by side.
 *
 * <p>For every analyzer a two-line banner is written to {@code System.err}
 * and each token is written on its own line to {@code System.out}.
 */
public class TestAnalyzer
{
    /** Alternative sample input: a plain English sentence. */
    private static final String testString1 = "The quick brown fox jumped over the lazy dogs";

    /** Default sample input: mixes punctuation, an ampersand and an e-mail address. */
    private static final String testString2 = "xy&z mail is - xyz@sohu.com";

    /**
     * Tokenizes {@code testString} with a {@link WhitespaceAnalyzer} and prints the tokens.
     * The printed description reads "analysis method: split on spaces".
     *
     * @param testString the text to analyze
     * @throws Exception if the token stream fails while being read or closed
     */
    public static void testWhitespace(String testString) throws Exception
    {
        printTokens(new WhitespaceAnalyzer(), testString,
                "=====Whitespace analyzer====",
                "分析方法:空格分割");
    }

    /**
     * Tokenizes {@code testString} with a {@link SimpleAnalyzer} and prints the tokens.
     * The printed description reads "analysis method: split on spaces and various symbols".
     *
     * @param testString the text to analyze
     * @throws Exception if the token stream fails while being read or closed
     */
    public static void testSimple(String testString) throws Exception
    {
        printTokens(new SimpleAnalyzer(), testString,
                "=====Simple analyzer====",
                "分析方法:空格及各种符号分割");
    }

    /**
     * Tokenizes {@code testString} with a {@link StopAnalyzer} and prints the tokens.
     * The printed description reads "analysis method: split on spaces and various
     * symbols, drop stop words; stop words include is, are, in, on, the and other
     * words without real meaning".
     *
     * @param testString the text to analyze
     * @throws Exception if the token stream fails while being read or closed
     */
    public static void testStop(String testString) throws Exception
    {
        printTokens(new StopAnalyzer(), testString,
                "=====stop analyzer====",
                "分析方法:空格及各种符号分割,去掉停止词,停止词包括 is,are,in,on,the等无实际意义的词");
    }

    /**
     * Tokenizes {@code testString} with a {@link StandardAnalyzer} and prints the tokens.
     * The printed description reads "analysis method: mixed splitting, including
     * stop-word removal, supports Chinese".
     *
     * @param testString the text to analyze
     * @throws Exception if the token stream fails while being read or closed
     */
    public static void testStandard(String testString) throws Exception
    {
        printTokens(new StandardAnalyzer(), testString,
                "=====standard analyzer====",
                "分析方法:混合分割,包括了去掉停止词,支持汉语");
    }

    /**
     * Entry point: echoes the sample input, then runs it through all four analyzers.
     *
     * @param args ignored
     * @throws Exception if any analyzer's token stream fails
     */
    public static void main(String[] args) throws Exception
    {
        // testString1 is an alternative input; assign it here instead to try it.
        String testString = testString2;
        System.out.println(testString);
        testWhitespace(testString);
        testSimple(testString);
        testStop(testString);
        testStandard(testString);
    }

    /**
     * Shared driver: prints the banner lines to stderr, then every token of
     * {@code text} produced by {@code analyzer} to stdout, one per line.
     *
     * <p>The stream is held as a plain {@link TokenStream} (the declared return type
     * of {@code Analyzer.tokenStream}) rather than being downcast to a concrete
     * tokenizer/filter class, so the code does not depend on how a given analyzer
     * builds its filter chain. The stream is always closed when done.
     *
     * @param analyzer    the analyzer under demonstration
     * @param text        the text to analyze
     * @param banner      heading line identifying the analyzer
     * @param description one-line description of the analyzer's splitting strategy
     * @throws Exception if the token stream fails while being read or closed
     */
    private static void printTokens(Analyzer analyzer, String text, String banner, String description)
            throws Exception
    {
        Reader reader = new StringReader(text);
        // The field name is irrelevant for these analyzers, hence the empty string.
        TokenStream stream = analyzer.tokenStream("", reader);
        try
        {
            System.err.println(banner);
            System.err.println(description);

            Token token;
            while ((token = stream.next()) != null)
            {
                System.out.println(token.termText());
            }
        }
        finally
        {
            // Release the stream (and the underlying reader) even if iteration fails.
            stream.close();
        }
    }

}
