Lucene In Action ch 5 笔记 --高级搜索技术
----- 2006-2-15
该章介绍了Lucene的一些高级技术, 如结果排序, 搜索多个Index, 过滤技术等. 下面就看看这些高级技巧吧.
I.Sorting search results
在Lucene中有两种特别的类型是用来排序的:Score和Index order
要对结果排序, 可以使用IndexSearcher重载的search函数, 并提供一个Sort参数. 看个例子.
SortingExample.java
01
package lia.advsearching;
02
03
import org.apache.commons.lang.StringUtils;
04
import org.apache.lucene.document.Document;
05
import org.apache.lucene.index.Term;
06
import org.apache.lucene.search.Hits;
07
import org.apache.lucene.search.IndexSearcher;
08
import org.apache.lucene.search.Query;
09
import org.apache.lucene.search.RangeQuery;
10
import org.apache.lucene.search.Sort;
11
import org.apache.lucene.search.SortField;
12
import org.apache.lucene.store.Directory;
13
import org.apache.lucene.store.FSDirectory;
14
15
import java.io.IOException;
16
import java.text.DecimalFormat;
17
18
public class SortingExample {
19
private Directory directory;
20
21
public SortingExample(Directory directory) {
22
this.directory = directory;
23
}
24
//
显示搜索结果
25
public void displayHits(Query query, Sort sort)
26
throws IOException {
27
IndexSearcher searcher = new IndexSearcher(directory);
28
29
Hits hits = searcher.search(query, sort); //
安
sort
来排序搜索结果
30
31
System.out.println("\nResults for: " +
32
query.toString() + " sorted by " + sort); //
打印
query
和
sort
33
34
System.out.println(StringUtils.rightPad("Title", 30) + //
使用
StringUtils(
来自
Apache commons)
打印结果
35
StringUtils.rightPad("pubmonth", 10) +
36
StringUtils.center("id", 4) +
37
StringUtils.center("score", 15));
38
39
DecimalFormat scoreFormatter = new DecimalFormat("0.######");
40
for (int i = 0; i < hits.length(); i++) { //
打印结果
41
Document doc = hits.doc(i);
42
System.out.println(
43
StringUtils.rightPad(
44
StringUtils.abbreviate(doc.get("title"), 29), 30) +
45
StringUtils.rightPad(doc.get("pubmonth"), 10) +
46
StringUtils.center("" + hits.id(i), 4) +
47
StringUtils.leftPad(
48
scoreFormatter.format(hits.score(i)), 12));
49
System.out.println(" " + doc.get("category"));
50
// System.out.println(searcher.explain(query, hits.id(i)));
51
}
52
53
searcher.close();
54
}
55
56
public static void main(String[] args) throws Exception {
57
Term earliest = new Term("pubmonth", "190001");
58
Term latest = new Term("pubmonth", "201012");
59
RangeQuery allBooks = new RangeQuery(earliest, latest, true); // query
60
61
String indexDir = System.getProperty("index.dir"); // index
的目录
62
63
FSDirectory directory =
64
FSDirectory.getDirectory(indexDir, false);
65
SortingExample example = new SortingExample(directory);
66
67
example.displayHits(allBooks, Sort.RELEVANCE); //
使用
Lucene
默认的排序
68
69
example.displayHits(allBooks, Sort.INDEXORDER); //
根据
IndexOrder
排序
70
71
example.displayHits(allBooks, new Sort("category")); //
根据
category
排序
72
73
example.displayHits(allBooks, new Sort("pubmonth", true)); //
根据
pubmonth
排序
74
75
example.displayHits(allBooks,
76
new Sort(new SortField[]{
77
new SortField("category"),
78
SortField.FIELD_SCORE,
79
new SortField("pubmonth", SortField.INT, true)
80
})); ///
81
82
83
example.displayHits(allBooks, new Sort(new SortField[] {SortField.FIELD_SCORE, new SortField("category")}));
84
}
85
}
当sort 参数是null ,new Sort(),和Sort.RELEVANCE 时,使用的是Lucene的默认排序(按照Relevance的递减排序), 默认搜索的结果如下:
先按照Score递减排序 如果Score相同则按照Docnum 递增排序.
If the order documents were indexed is relevant, you can use Sort.INDEXORDER.
下面是其输出结果: (按照ID来排序)
要利用Field排序,该field要满足第二章排序(参考我的Blog上的内容)的要求. 下面是使用category field的输出.
默认的field排序是按照自然排序,利用Sort的重载函数,提供一个reverse参数可以改变顺序.结果如下:
example.displayHits(allBooks, new Sort("pubmonth", true));
提供了 true 参数.
还可以根据多个 Field 排序. 用法如下:
example.displayHits(allBooks,
new Sort(new SortField[]{
new SortField("category"),
SortField.FIELD_SCORE,
new SortField("pubmonth", SortField.INT, true)
}));
结果如下:
当使用 SortField.STRING 类型来排序时, 结果可能会跟 Locale 有关, 可以使用如下方法设置:
public SortField (String field, Locale locale)
public SortField (String field, Locale locale, boolean reverse)
在排序的时候, 要占用更多的资源. 这一点值得注意.
II. 使用 PhrasePrefixQuery
PhrasePrefixQuery 可以说是 PhraseQuery 的一个增强版, 可以在同一个位置放置多个 term, slop 的设置和 PhraseQuery 的一样. 看个例子:
01
package lia.advsearching;
02
03
import junit.framework.TestCase;
04
import org.apache.lucene.analysis.WhitespaceAnalyzer;
05
import org.apache.lucene.document.Document;
06
import org.apache.lucene.document.Field;
07
import org.apache.lucene.index.IndexWriter;
08
import org.apache.lucene.index.Term;
09
import org.apache.lucene.search.BooleanQuery;
10
import org.apache.lucene.search.Hits;
11
import org.apache.lucene.search.IndexSearcher;
12
import org.apache.lucene.search.PhrasePrefixQuery;
13
import org.apache.lucene.search.PhraseQuery;
14
import org.apache.lucene.store.RAMDirectory;
15
16
import java.io.IOException;
17
18
public class PhrasePrefixQueryTest extends TestCase {
19
private IndexSearcher searcher;
20
21
protected void setUp() throws Exception {
22
RAMDirectory directory = new RAMDirectory();
23
IndexWriter writer = new IndexWriter(directory,
24
new WhitespaceAnalyzer(), true);
25
Document doc1 = new Document();
26
doc1.add(Field.Text("field",
27
"the quick brown fox jumped over the lazy dog")); ///
一个文档含有
quick fox
28
writer.addDocument(doc1);
29
Document doc2 = new Document();
30
doc2.add(Field.Text("field",
31
"the fast fox hopped over the hound")); ///
另一个文档含有
fast fox
32
writer.addDocument(doc2);
33
writer.close();
34
35
searcher = new IndexSearcher(directory);
36
}
37
38
public void testBasic() throws Exception {
39
PhrasePrefixQuery query = new PhrasePrefixQuery(); //
构造一个
PhrasePrefixQuery
40
query.add(new Term[] { //
搜索一个
含有
quick fox
或者
fast fox
的文档
41
new Term("field", "quick"),
42
new Term("field", "fast")
43
});
44
query.add(new Term("field", "fox")); //
默认的
slop
45
System.out.println(query);
46
47
Hits hits = searcher.search(query);
48
assertEquals("fast fox match", 1, hits.length());
49
50
query.setSlop(1); //
设置
slop
51
hits = searcher.search(query);
52
assertEquals("both match", 2, hits.length());
53
}
54
55
public void testAgainstOR() throws Exception {
56
PhraseQuery quickFox = new PhraseQuery();
57
quickFox.setSlop(1);
58
quickFox.add(new Term("field", "quick"));
59
quickFox.add(new Term("field", "fox"));
60
61
PhraseQuery fastFox = new PhraseQuery();
62
fastFox.add(new Term("field", "fast"));
63
fastFox.add(new Term("field", "fox"));
64
65
BooleanQuery query = new BooleanQuery(); //
使用
BooleanQuery
和
PhraseQuery
构造和上面等级的搜索条件
66
query.add(quickFox, false, false);
67
query.add(fastFox, false, false);
68
Hits hits = searcher.search(query);
69
assertEquals(2, hits.length());
70
}
71
72
73
private void debug(Hits hits) throws IOException {
74
for (int i=0; i < hits.length(); i++) {
75
Document doc = hits.doc(i);
76
System.out.println(hits.score(i) + ": " + doc.get("field"));
77
}
78
79
}
80
}
注意:
One difference between PhrasePrefixQuery and the BooleanQuery of PhraseQuery's approach is that the slop factor is applied globally with PhrasePrefixQuery—it's applied on a per-phrase basis with PhraseQuery.
Lucene's QueryParser doesn't currently support PhrasePrefixQuery.
III. Querying on multiple fields at once
MultiFieldQueryParser 支持用同一个关键字对多个字段进行搜索. 该类使用比较简单, 看看例子:
01
package lia.advsearching;
02
03
import lia.common.LiaTestCase;
04
import org.apache.lucene.analysis.SimpleAnalyzer;
05
import org.apache.lucene.queryParser.MultiFieldQueryParser;
06
import org.apache.lucene.search.Hits;
07
import org.apache.lucene.search.IndexSearcher;
08
import org.apache.lucene.search.Query;
09
10
public class MultiFieldQueryParserTest extends LiaTestCase {
11
public void testDefaultOperator() throws Exception {
12
Query query = MultiFieldQueryParser.parse("development",
13
new String[]{"title", "subject"},
14
new SimpleAnalyzer());
15
16
IndexSearcher searcher = new IndexSearcher(directory);
17
Hits hits = searcher.search(query);
18
19
assertHitsIncludeTitle(hits, "Java Development with Ant");
20
21
// has "development" in the subject field
22
assertHitsIncludeTitle(hits, "Extreme Programming Explained");
23
}
24
25
public void testSpecifiedOperator() throws Exception {
26
Query query = MultiFieldQueryParser.parse("development",
27
new String[]{"title", "subject"}, ///
在两个
Field
中搜索
28
new int[]{MultiFieldQueryParser.REQUIRED_FIELD,
29
MultiFieldQueryParser.REQUIRED_FIELD},
30
new SimpleAnalyzer());
31
32
IndexSearcher searcher = new IndexSearcher(directory);
33
Hits hits = searcher.search(query);
34
35
assertHitsIncludeTitle(hits, "Java Development with Ant");
36
assertEquals("one and only one", 1, hits.length());
37
}
38
39
}
注意:
Generally speaking, querying on multiple fields isn't the best practice for user-entered queries. More commonly, all words you want searched are indexed into a contents or keywords field by combining various fields. A synthetic contents field in our test environment uses this scheme to put author and subjects together:
doc.add(Field.UnStored("contents", author + " " + subjects));
We used a space (" ") between author and subjects to separate words for the analyzer. Allowing users to enter text in the simplest manner possible without the need to qualify field names generally makes for a less confusing user experience.
If you choose to use MultiFieldQueryParser, be sure your queries are fabricated appropriately using the QueryParser and Analyzer diagnostic techniques shown in chapters 3 and 4. Plenty of odd interactions with analysis occur using QueryParser, and these are compounded using MultiFieldQueryParser.
posted on 2007-01-05 10:25
Lansing 阅读(607)
评论(0) 编辑 收藏