一.问题提出:像这样的"[TVB2008][溏心风暴之家好月圆][国语][DVD-RMVB][第01集]|BT285.cn|BT下载|BT电影|BT软件"的标题,我们怎样提取出关键字"家好月圆",当然是在电影或是电视剧的范围内.
        二.不用词典,你用什么方式分词.

解决方法:
      1.针对此网站大部分都是BT下载,一般都是电影,电视剧,动漫.先对整个网站的标题进行Lucene索引.
      2.去掉一些常用符号如:数字,字母,标志符.如"[TVB2008][溏心风暴之家好月圆][国语][DVD-RMVB][第01集]|BT285.cn|BT下载|BT电影|BT软件" 变成"溏心风暴之家好月圆 国语 第 集"
      3.采用二元切分法.如"溏心风暴之家好月圆 国语 第 集" 分拆成"溏心 心风 风暴 暴之 之家 家好 好月 月圆 国语 第 集"
      4.采用统计方法,对一些高频词去掉,如"国语 第 集" 因为"国语" 出现次数3000次(注:数据库的数据有25万条),"第"与"集"也如此.
      5.对"溏心 心风 风暴 暴之 之家 家好 好月 月圆 国语 第 集"进行分析并统计次数.
      6.按最长连接词拼凑.按一定的比例组合划分.
      7.得出"家好月圆"为其关键字

 上代码:

package com.wwy.lucene;

import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.List;

import org.apache.log4j.Logger;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Searcher;
import org.springframework.context.support.ClassPathXmlApplicationContext;

import com.funinhand.orm.database.base.Page;
import com.funinhand.shld.dao.Bt285DAO;
import com.funinhand.shld.dao.WNewsDAO;
import com.funinhand.shld.pojo.Bt285;
import com.funinhand.shld.pojo.WNews;
import com.wwy.thread.Configure;

/**
 * 本代码在下面的网站中测试过并取的60%左右的有效数.
 * 
 * 
http://www.bt285.cn BT下载 
 * 
http://www.yaonba.com.cn NBA中文网
 * 
http://www.5a520.cn 小说520网 
 * 
http://www.vagaaga.cn vagaaga
 *author by wangdei
 *2008-6-12 上午11:16:41
 *
 
*/


public class KeyWordLucene {
    
        
private static Logger logger = Logger.getLogger(KeyWordLucene.class);
        
        
public static String[] StopStrs = {"BT285","BT软件","BT电影","BT下载"};
        
        
        
public static String path = null ;
        
        
private List<DataModel> sentenceList = new ArrayList<DataModel>();
        
        
public KeyWordLucene(){
            logger.debug(
"KeyWordLucene init");
        }

        
public int queryMatchSize(String queryStr,String lucennePath){        
            Searcher searcher 
= null;
            
try {
                searcher 
= new IndexSearcher(lucennePath);// "index"指定索引文件位置
                Document doc = new Document();
                StandardAnalyzer analyzer 
= new StandardAnalyzer(StopStrs);
                
// 一段简单的检索代码
                QueryParser queryParser = new QueryParser("title", analyzer);
                Query query 
= queryParser.parse(queryStr);
                
// 检索    
                Hits hits = searcher.search(query);
                
int size = hits.length();
                searcher.close();
                
return size;
            }
 catch (Exception e) {
                
return 0;
            }
finally{
                 
if(searcher!=null){   
                     
try {   
                         searcher.close();   
                     }
   
                     
catch (Exception e) {}   
                     searcher 
= null;   
                   }
   

            }

            
        }

        
        
        
public boolean stopWord(String word){
            
char charAsi = word.toCharArray()[0];
            
int intAsi = (int)charAsi;
            
if(intAsi<127)
                
return true;
            
/*
             * 65293 -,12289 、,9670 ◆,9733 ★,12300 「,8545 Ⅱ,65281 !,65288 ( ,65306 :,
             
*/

            
int[] asiExpert = {65293,12289,9670,9733,12300,8545,65281,65288,65306};
            
for(int i=0;i<asiExpert.length;i++){
                
if(asiExpert[i] == intAsi)
                    
return true;
            }

            String[] str 
= {"","","[","]","."," ","/","\\","","","?",""};
            
for(int i=0;i<str.length;i++){
                
if(word.equals(str[i]))
                    
return true;
            }

            
return false;
        }

        
        
/**
         * 返回整个句子的集合
         *  cat news Id=211562,【There.Will.Be.Blood.血色黑金.2007.Blu-ray.a1080.x264.DD51 D9】
             血色=165,j=21,time=23
             色黑=19,j=22,time=23
             黑金=86,j=23,time=23
         * 
@param news
         * 
@return
         
*/

        
private void charMatch(WNews news){
            sentenceList.clear();
            String title 
= news.getTitle();
            
int matchSize = 0;
            
int length = title.length();
            
for(int j=0;j<length-2;j++){
                
if(stopWord(title.substring(j,j+1)) || stopWord(title.substring(j+1,j+2)))
                        
continue;
                DataModel dataModel 
= new DataModel();
                
long begin = System.currentTimeMillis();
                matchSize 
=queryMatchSize(title.substring(j,j+2), path);
                
long needsTime = (System.currentTimeMillis()-begin);
                dataModel.setName(title.substring(j,j
+2));
                dataModel.setSize(matchSize);
                dataModel.setLocation(j);
                dataModel.setNeedTime(needsTime);
                
//logger.debug(dataModel.toString());
                sentenceList.add(dataModel);
            }

        }

        
        
/**
         * 提取字符串的.
         * [古天乐刘若英最新钜献][我们这一家][粤语中字][那些发垃圾信息的不是我]
         * [天乐=122,j=2,time=16,
         *  刘若=84,j=4,time=16, 
         *  若英=84,j=5,time=0,
         *  最新=10298,j=7,time=31, 
         * 钜献=11,j=9,time=16, 
         * 我们=441,j=13,time=15, 
         * 一家=518,j=16,time=16, 
         * 语中=31128,j=21,time=63,
         * 中字=36065,j=22,time=78,
         * 那些=17,j=26,time=16, 
         * 垃圾=24,j=29,time=94,
         * 信息=12,j=31,time=0, 
         * 不是=167,j=34,time=15]
         
*/

        
private void pickChar(){
            List
<DataModel> tempList = new ArrayList<DataModel>();
            
int size = sentenceList.size();
            
//取二元比较字符串
            int i = 0;
            
while(i<size){                                    
                DataModel dataModel1 
= sentenceList.get(i);
                
if(i == size-1){
                    tempList.add(dataModel1);
                    sentenceList.clear();
                    setSentenceList(tempList);
                    
return;
                }

                DataModel dataModel2 
= sentenceList.get(i+1);
                
int location1 = dataModel1.getLocation();
                
int location2 = dataModel2.getLocation();
                
//如果是相邻的
                if(location1 +1  == location2){
                    
if(dataModel1.getSize()>=dataModel2.getSize())
                        tempList.add(dataModel1);
                    
else
                        tempList.add(dataModel2);
                    i
+=2;
                    
if(i == size){
                        
//加最后一个字
                        sentenceList.clear();
                        setSentenceList(tempList);
                        
return;
                    }

                }
else{
                    tempList.add(dataModel1);
                    
//tempList.add(dataModel2);
                    i++;
                }

            }

            
return;
        }

        
        
/**
         * 去掉出现机率大的字符:如美国,大片,影视,==
         *
         
*/

        
private void delRateBigChar(){
            List
<DataModel> tempList = new ArrayList<DataModel>();
            
int size = sentenceList.size();
            
int i = 0;
            
while(i<size){                                    
                DataModel dataModel 
= sentenceList.get(i);
                
if(dataModel.getSize()>1000){
                    i
++;
                    
continue;
                }

                tempList.add(dataModel);
                i
++;
            }

            sentenceList.clear();
            setSentenceList(tempList);
        }

        
        
/**
         * 组合字符
         *
         
*/

        
private String compisteChar(){
            List
<DataModel> tempList = new ArrayList<DataModel>();
            
int size = sentenceList.size();
            
int i = 0;        
            
while(i<size){
                
int j = i;
                DataModel dataModel 
= new DataModel("",0,0);
                StringBuffer sb 
= new StringBuffer(); 
                
boolean isFetch = false;
                
while(j<size){                
                    DataModel dataModelJ1 
= sentenceList.get(j);
                    
int location1 = dataModelJ1.getLocation();            
                    
if(j == size-1){
                        dataModel.setName(dataModel.getName() 
+ dataModelJ1.getName());
                        dataModel.setLocation(dataModel.getLocation() 
+ dataModelJ1.getLocation());
                        dataModel.setAfterLocation(dataModelJ1.getAfterLocation());
                        tempList.add(dataModel);
                        
break;
                    }

                    DataModel dataModelJ2 
= sentenceList.get(j+1);
                    
int location2 = dataModelJ2.getLocation();
                    
if(dataModel.getAfterLocation() +2 <location1 && isFetch){
                        tempList.add(dataModel);
                        
break;
                    }
                        
                    
if(location1 +1==location2){
                        isFetch 
= true;
                        sb.append(dataModelJ1.getName() 
+dataModelJ2.getName().substring(1));
                        dataModel.setName(dataModel.getName() 
+ sb.toString());
                        dataModel.setAfterLocation(dataModelJ2.getAfterLocation());
                        dataModel.setLocation(dataModel.getLocation() 
+ dataModelJ1.getLocation()+ dataModelJ2.getLocation());
                    }

                    
else if(location1 +2==location2){
                        isFetch 
= true;
                        sb.append(dataModelJ1.getName() 
+dataModelJ2.getName());
                        dataModel.setName(dataModel.getName() 
+ sb.toString());
                        dataModel.setAfterLocation(dataModelJ2.getAfterLocation());
                        dataModel.setLocation(dataModel.getLocation() 
+ dataModelJ2.getLocation());
                    }

                    
else{
                        dataModel.setName(dataModel.getName() 
+ sb.toString());
                        dataModel.setAfterLocation(dataModelJ2.getAfterLocation());
                        dataModel.setLocation(dataModel.getLocation() 
+ dataModelJ1.getLocation()+ dataModelJ2.getLocation());
                        tempList.add(dataModel);
                        
break;
                    }

                    j
+=2;
                    
if(j == size){
                        tempList.add(dataModel);
                        
break;
                    }

                }

                i
++;
            }

            
//取最长的字符串
            int tempSize = tempList.size();
            
//如果字符串都超过给定的最大值时,或是其他情况
            if(tempSize == 0)
                
return null;
            DataModel dataModelInit 
= tempList.get(0);        
            
for(int k=1;k<tempSize;k++){
                String nameInit 
= dataModelInit.getName();
                DataModel dataModel 
= tempList.get(k);
                String name 
= dataModel.getName();
                
if(name.length()>nameInit.length())
                    dataModelInit.setName(name);
            }

            logger.info(
"getName()=" +dataModelInit.getName());
            
return dataModelInit.getName();
        }

        
public String seqAction(WNews news){
            
if(logger.isDebugEnabled())
                logger.debug(
"KeyWordLucene seqAction is begin");
            charMatch(news);
            pickChar();
            delRateBigChar();
            
return compisteChar();
        }

        
public static void main(String[] args) throws Exception {
            System.out.println(
"server begin!");
            Configure.propertiesConfigure();
             path 
= Configure.getCreateBtLucenePath();
            ClassPathXmlApplicationContext appContext 
= new ClassPathXmlApplicationContext(
            
"./mysqlContext.xml");
            WNewsDAO newsDAO 
= (WNewsDAO) appContext.getBean("wNewDaoProxy");
            Bt285DAO bt285DAO 
= (Bt285DAO)appContext.getBean("bt285DAO");
            KeyWordLucene action 
= new KeyWordLucene();
            
for(int i=1;i<3;i++){
                
try {
                    Thread.sleep(
1000L);
                }
 catch (InterruptedException e) {
                    
// TODO Auto-generated catch block
                    e.printStackTrace();
                }

                Page page 
= new Page();
                logger.info(
"i=" + i);
                page.setPageIndex(i);
                page.setPageSize(
1000);
                List
<WNews> list = bt285DAO.findPageByQuery(
                        
"select t from wnews t ",null, page);
                
for(WNews news:list){
                    
//if(news.getId() != 212142)
                        
//continue;
                    logger.debug("cat news Id=" + news.getId() +"," + news.getTitle());
                    action.seqAction(news);
                }

            }

            
            
//action.batchCreate();        
            
            System.out.println(URLEncoder.encode(
"天兆","UTF-8"));//%E5%A4%A9%E5%85%86
            System.out.println("server finish!");
        }



        
public List<DataModel> getSentenceList() {
            
return sentenceList;
        }



        
public void setSentenceList(List<DataModel> sentenceList) {
            
this.sentenceList = sentenceList;
        }

}

 

测试用例:
日本H动漫下载)鬼作第五夜-杉本翔子 
[TVB2008][当狗爱上猫][国语][DVD-RMVB][04-05集]
夜来香社区官方网站
结论:60%有效.

不足之处:
 1.没有采用词典,而用二元切分,速度慢,词有较大的歧义性.
 2.有些词用统计方法不准确.
 3.干扰系数较大.

 续,准备采用词典