protected Query getFieldQuery(String field, String queryText) throws ParseException { //需要用analyzer对文本进行分词 TokenStream source; try { source = analyzer.reusableTokenStream(field, new StringReader(queryText)); source.reset(); } catch (IOException e) { source = analyzer.tokenStream(field, new StringReader(queryText)); } CachingTokenFilter buffer = new CachingTokenFilter(source); TermAttribute termAtt = null; PositionIncrementAttribute posIncrAtt = null; int numTokens = 0; boolean success = false; try { buffer.reset(); success = true; } catch (IOException e) { } //得到TermAttribute和PositionIncrementAttribute,此两项将决定到底产生什么样的Query对象 if (success) { if (buffer.hasAttribute(TermAttribute.class)) { termAtt = buffer.getAttribute(TermAttribute.class); } if (buffer.hasAttribute(PositionIncrementAttribute.class)) { posIncrAtt = buffer.getAttribute(PositionIncrementAttribute.class); } } int positionCount = 0; boolean severalTokensAtSamePosition = false; boolean hasMoreTokens = false; if (termAtt != null) { try { //遍历分词后的所有Token,统计Tokens的个数numTokens,以及positionIncrement的总数,即positionCount。 //当有一次positionIncrement为0的时候,severalTokensAtSamePosition设为true,表示有多个Token处在同一个位置。 hasMoreTokens = buffer.incrementToken(); while (hasMoreTokens) { numTokens++; int positionIncrement = (posIncrAtt != null) ? posIncrAtt.getPositionIncrement() : 1; if (positionIncrement != 0) { positionCount += positionIncrement; } else { severalTokensAtSamePosition = true; } hasMoreTokens = buffer.incrementToken(); } } catch (IOException e) { } } try { //重设buffer,以便生成phrase查询的时候,term和position可以重新遍历。 buffer.reset(); source.close(); } catch (IOException e) { } if (numTokens == 0) return null; else if (numTokens == 1) { //如果分词后只有一个Token,则生成TermQuery String term = null; try { boolean hasNext = buffer.incrementToken(); term = termAtt.term(); } catch (IOException e) { } return newTermQuery(new Term(field, term)); } else { //如果分词后不只有一个Token if (severalTokensAtSamePosition) { //如果有多个Token处于同一个位置 if (positionCount == 1) { //并且处于同一位置的Token还全部处于第一个位置,则生成BooleanQuery,处于同一位置的Token之间是OR的关系 BooleanQuery q = newBooleanQuery(true); for (int i = 0; i < numTokens; i++) { String term = null; try { boolean hasNext = buffer.incrementToken(); term = termAtt.term(); } catch (IOException e) { } Query currentQuery = newTermQuery(new Term(field, term)); q.add(currentQuery, BooleanClause.Occur.SHOULD); } return q; } else { //如果有多个Token处于同一位置,但不是第一个位置,则生成MultiPhraseQuery。 //所谓MultiPhraseQuery即其可以包含多个phrase,其又一个ArrayList<Term[]> termArrays,每一项都是一个Term的数组,属于同一个数组的Term表示在同一个位置。它有函数void add(Term[] terms)一次添加一个数组的Term。比如我们要搜索"microsoft app*",其表示多个phrase,"microsoft apple","microsoft application"都算。此时用QueryParser.parse("\"microsoft app*\"")从而生成PhraseQuery是搜不出microsoft apple和microsoft application的,也不能搜出microsoft app,因为*一旦被引号所引,就不算通配符了。所以必须生成MultiPhraseQuery,首先用add(new Term[]{new Term("field", "microsoft")})将microsoft作为一个Term数组添加进去,然后用add(new Term[]{new Term("field", "app"), new Term("field", "apple"), new Term("field", "application")})作为一个Term数组添加进去(算作同一个位置的),则三者都能搜的出来。 MultiPhraseQuery mpq = newMultiPhraseQuery(); mpq.setSlop(phraseSlop); List<Term> multiTerms = new ArrayList<Term>(); int position = -1; for (int i = 0; i < numTokens; i++) { String term = null; int positionIncrement = 1; try { boolean hasNext = buffer.incrementToken(); assert hasNext == true; term = termAtt.term(); if (posIncrAtt != null) { positionIncrement = posIncrAtt.getPositionIncrement(); } } catch (IOException e) { } if (positionIncrement > 0 && multiTerms.size() > 0) { //如果positionIncrement大于零,说明此Term和前一个Term已经不是同一个位置 了,所以原来收集在multiTerms中的Term都算作同一个位置,添加到MultiPhraseQuery中作为一项。并清除 multiTerms,以便重新收集相同位置的Term。 if (enablePositionIncrements) { mpq.add(multiTerms.toArray(new Term[0]),position); } else { mpq.add(multiTerms.toArray(new Term[0])); } multiTerms.clear(); } //将此Term收集到multiTerms中。 position += positionIncrement; multiTerms.add(new Term(field, term)); } //当遍历完所有的Token,同处于最后一个位置的Term已经收集到multiTerms中了,把他们加到MultiPhraseQuery中作为一项。 if (enablePositionIncrements) { mpq.add(multiTerms.toArray(new Term[0]),position); } else { mpq.add(multiTerms.toArray(new Term[0])); } return mpq; } } else { //如果不存在多个Token处于同一个位置的情况,则直接生成PhraseQuery PhraseQuery pq = newPhraseQuery(); pq.setSlop(phraseSlop); int position = -1; for (int i = 0; i < numTokens; i++) { String term = null; int positionIncrement = 1; try { boolean hasNext = buffer.incrementToken(); assert hasNext == true; term = termAtt.term(); if (posIncrAtt != null) { positionIncrement = posIncrAtt.getPositionIncrement(); } } catch (IOException e) { } if (enablePositionIncrements) { position += positionIncrement; pq.add(new Term(field, term),position); } else { pq.add(new Term(field, term)); } } return pq; } } } |