不用词典利用Lucene取标题中的关键字

一.问题提出:像这样的"[TVB2008][溏心风暴之家好月圆][国语][DVD-RMVB][第01集]|BT285.cn|BT下载|BT电影|BT软件"的标题,我们怎样提取出关键字"家好月圆",当然是在电影或是电视剧的范围内.
二.不用词典,你用什么方式分词.

解决方法:
      1.针对此网站大部分都是BT下载,一般都是电影,电视剧,动漫.先对整个网站的标题进行Lucene索引.
      2.去掉一些常用符号如:数字,字母,标志符.如"[TVB2008][溏心风暴之家好月圆][国语][DVD-RMVB][第01集]|BT285.cn|BT下载|BT电影|BT软件" 变成"溏心风暴之家好月圆国语第集"
      3.采用二元切分法(每相邻两个字作为一个词).如"溏心风暴之家好月圆国语第集" 分拆成"溏心 心风 风暴 暴之 之家 家好 好月 月圆 圆国 国语 语第 第集"
      4.采用统计方法,去掉一些高频词,如"国语第集":因为"国语"出现了3000次(注:数据库共有25万条数据),"第"与"集"也是如此.
      5.对"溏心心风风暴暴之之家家好好月圆国语第集"进行分析并统计次数.
      6.按最长连接词拼凑.按一定的比例组合划分.
      7.得出"家好月圆"为其关键字

上代码:

package com.wwy.lucene;

import java.net.URLEncoder;

import java.util.ArrayList;

import java.util.List;

import org.apache.log4j.Logger;

import org.apache.lucene.analysis.standard.StandardAnalyzer;

import org.apache.lucene.document.Document;

import org.apache.lucene.queryParser.QueryParser;

import org.apache.lucene.search.Hits;

import org.apache.lucene.search.IndexSearcher;

import org.apache.lucene.search.Query;

import org.apache.lucene.search.Searcher;

import org.springframework.context.support.ClassPathXmlApplicationContext;

import com.funinhand.orm.database.base.Page;

import com.funinhand.shld.dao.Bt285DAO;

import com.funinhand.shld.dao.WNewsDAO;

import com.funinhand.shld.pojo.Bt285;

import com.funinhand.shld.pojo.WNews;

import com.wwy.thread.Configure;

/**

* 本代码在下面的网站中测试过,并取得60%左右的有效数.

* http://www.bt285.cn BT下载

* http://www.yaonba.com.cn NBA中文网

* http://www.5a520.cn 小说520网

* http://www.vagaaga.cn vagaaga

* author by wangdei

* 2008-6-12 上午11:16:41

*/

public class KeyWordLucene {

/** Class-wide log4j logger; never reassigned, hence final. */
private static final Logger logger = Logger.getLogger(KeyWordLucene.class);

/** Stop words handed to the StandardAnalyzer that parses every query. */
public static String[] StopStrs = {"BT285","BT软件","BT电影","BT下载"};

/** Location of the Lucene index used for lookups; assigned in main before processing starts. */
public static String path = null ;

/** Working list of bigram records for the title currently being processed. */
private List<DataModel> sentenceList = new ArrayList<DataModel>();

/**
 * Creates an extractor with an empty working list and logs the creation at debug level.
 */
public KeyWordLucene(){
    logger.debug("KeyWordLucene init");
}

public int queryMatchSize(String queryStr,String lucennePath){

Searcher searcher = null;

try {

searcher = new IndexSearcher(lucennePath);// "index"指定索引文件位置

Document doc = new Document();

StandardAnalyzer analyzer = new StandardAnalyzer(StopStrs);

// 一段简单的检索代码

QueryParser queryParser = new QueryParser("title", analyzer);

Query query = queryParser.parse(queryStr);

// 检索

Hits hits = searcher.search(query);

int size = hits.length();

searcher.close();

return size;

} catch (Exception e) {

return 0;

}finally{

if(searcher!=null){

try {

searcher.close();

}

catch (Exception e) {}

searcher = null;

}

public boolean stopWord(String word){

char charAsi = word.toCharArray()[0];

int intAsi = (int)charAsi;

if(intAsi<127)

return true;

* 65293 -,12289 、,9670 ◆,9733 ★,12300 「,8545 Ⅱ,65281 ！,65288 （ ,65306 ：,

int[] asiExpert = {65293,12289,9670,9733,12300,8545,65281,65288,65306};

for(int i=0;i<asiExpert.length;i++){

if(asiExpert[i] == intAsi)

return true;

}

String[] str = {"【","】","[","]","."," ","/","\\","》","《","?","的"};

for(int i=0;i<str.length;i++){

if(word.equals(str[i]))

return true;

}

return false;

}

/**

* 返回整个句子的集合

* cat news Id=211562,【There.Will.Be.Blood.血色黑金.2007.Blu-ray.a1080.x264.DD51 D9】

血色=165,j=21,time=23

色黑=19,j=22,time=23

黑金=86,j=23,time=23

* @param news

* @return

private void charMatch(WNews news){

sentenceList.clear();

String title = news.getTitle();

int matchSize = 0;

int length = title.length();

for(int j=0;j<length-2;j++){

if(stopWord(title.substring(j,j+1)) || stopWord(title.substring(j+1,j+2)))

continue;

DataModel dataModel = new DataModel();

long begin = System.currentTimeMillis();

matchSize =queryMatchSize(title.substring(j,j+2), path);

long needsTime = (System.currentTimeMillis()-begin);

dataModel.setName(title.substring(j,j+2));

dataModel.setSize(matchSize);

dataModel.setLocation(j);

dataModel.setNeedTime(needsTime);

//logger.debug(dataModel.toString());

sentenceList.add(dataModel);

}

/**

* 提取字符串的.

* [古天乐刘若英最新钜献][我们这一家][粤语中字][那些发垃圾信息的不是我]

* [天乐=122,j=2,time=16,

* 刘若=84,j=4,time=16,

* 若英=84,j=5,time=0,

* 最新=10298,j=7,time=31,

* 钜献=11,j=9,time=16,

* 我们=441,j=13,time=15,

* 一家=518,j=16,time=16,

* 语中=31128,j=21,time=63,

* 中字=36065,j=22,time=78,

* 那些=17,j=26,time=16,

* 垃圾=24,j=29,time=94,

* 信息=12,j=31,time=0,

* 不是=167,j=34,time=15]

private void pickChar(){

List<DataModel> tempList = new ArrayList<DataModel>();

int size = sentenceList.size();

//取二元比较字符串

int i = 0;

while(i<size){

DataModel dataModel1 = sentenceList.get(i);

if(i == size-1){

tempList.add(dataModel1);

sentenceList.clear();

setSentenceList(tempList);

return;

}

DataModel dataModel2 = sentenceList.get(i+1);

int location1 = dataModel1.getLocation();

int location2 = dataModel2.getLocation();

//如果是相邻的

if(location1 +1 == location2){

if(dataModel1.getSize()>=dataModel2.getSize())

tempList.add(dataModel1);

else

tempList.add(dataModel2);

i+=2;

if(i == size){

//加最后一个字

sentenceList.clear();

setSentenceList(tempList);

return;

}

}else{

tempList.add(dataModel1);

//tempList.add(dataModel2);

i++;

}

return;

}

/**

* 去掉出现机率大的字符:如美国,大片,影视,==

private void delRateBigChar(){

List<DataModel> tempList = new ArrayList<DataModel>();

int size = sentenceList.size();

int i = 0;

while(i<size){

DataModel dataModel = sentenceList.get(i);

if(dataModel.getSize()>1000){

i++;

continue;

}

tempList.add(dataModel);

i++;

}

sentenceList.clear();

setSentenceList(tempList);

}

/**

* 组合字符

private String compisteChar(){

List<DataModel> tempList = new ArrayList<DataModel>();

int size = sentenceList.size();

int i = 0;

while(i<size){

int j = i;

DataModel dataModel = new DataModel("",0,0);

StringBuffer sb = new StringBuffer();

boolean isFetch = false;

while(j<size){

DataModel dataModelJ1 = sentenceList.get(j);

int location1 = dataModelJ1.getLocation();

if(j == size-1){

dataModel.setName(dataModel.getName() + dataModelJ1.getName());

dataModel.setLocation(dataModel.getLocation() + dataModelJ1.getLocation());

dataModel.setAfterLocation(dataModelJ1.getAfterLocation());

tempList.add(dataModel);

break;

}

DataModel dataModelJ2 = sentenceList.get(j+1);

int location2 = dataModelJ2.getLocation();

if(dataModel.getAfterLocation() +2 <location1 && isFetch){

tempList.add(dataModel);

break;

}

if(location1 +1==location2){

isFetch = true;

sb.append(dataModelJ1.getName() +dataModelJ2.getName().substring(1));

dataModel.setName(dataModel.getName() + sb.toString());

dataModel.setAfterLocation(dataModelJ2.getAfterLocation());

dataModel.setLocation(dataModel.getLocation() + dataModelJ1.getLocation()+ dataModelJ2.getLocation());

}

else if(location1 +2==location2){

isFetch = true;

sb.append(dataModelJ1.getName() +dataModelJ2.getName());

dataModel.setName(dataModel.getName() + sb.toString());

dataModel.setAfterLocation(dataModelJ2.getAfterLocation());

dataModel.setLocation(dataModel.getLocation() + dataModelJ2.getLocation());

}

else{

dataModel.setName(dataModel.getName() + sb.toString());

dataModel.setAfterLocation(dataModelJ2.getAfterLocation());

dataModel.setLocation(dataModel.getLocation() + dataModelJ1.getLocation()+ dataModelJ2.getLocation());

tempList.add(dataModel);

break;

}

j+=2;

if(j == size){

tempList.add(dataModel);

break;

}

i++;

}

//取最长的字符串

int tempSize = tempList.size();

//如果字符串都超过给定的最大值时,或是其他情况

if(tempSize == 0)

return null;

DataModel dataModelInit = tempList.get(0);

for(int k=1;k<tempSize;k++){

String nameInit = dataModelInit.getName();

DataModel dataModel = tempList.get(k);

String name = dataModel.getName();

if(name.length()>nameInit.length())

dataModelInit.setName(name);

}

logger.info("getName()=" +dataModelInit.getName());

return dataModelInit.getName();

}

/**
 * Runs the whole extraction pipeline for one news item: bigram lookup, overlap
 * resolution, removal of over-frequent bigrams and final composition.
 *
 * @param news news item whose title is analysed
 * @return the value produced by the composition step, which may be {@code null}
 */
public String seqAction(WNews news){
    if (logger.isDebugEnabled()) {
        logger.debug("KeyWordLucene seqAction is begin");
    }

    charMatch(news);
    pickChar();
    delRateBigChar();

    final String keyword = compisteChar();
    return keyword;
}

public static void main(String[] args) throws Exception {

System.out.println("server begin!");

Configure.propertiesConfigure();

path = Configure.getCreateBtLucenePath();

ClassPathXmlApplicationContext appContext = new ClassPathXmlApplicationContext(

"./mysqlContext.xml");

WNewsDAO newsDAO = (WNewsDAO) appContext.getBean("wNewDaoProxy");

Bt285DAO bt285DAO = (Bt285DAO)appContext.getBean("bt285DAO");

KeyWordLucene action = new KeyWordLucene();

for(int i=1;i<3;i++){

try {

Thread.sleep(1000L);

} catch (InterruptedException e) {

// TODO Auto-generated catch block

e.printStackTrace();

}

Page page = new Page();

logger.info("i=" + i);

page.setPageIndex(i);

page.setPageSize(1000);

List<WNews> list = bt285DAO.findPageByQuery(

"select t from wnews t ",null, page);

for(WNews news:list){

//if(news.getId() != 212142)

//continue;

logger.debug("cat news Id=" + news.getId() +"," + news.getTitle());

action.seqAction(news);

}

//action.batchCreate();

System.out.println(URLEncoder.encode("天兆","UTF-8"));//%E5%A4%A9%E5%85%86

System.out.println("server finish!");

}

/**
 * Returns the current working list of bigram records.
 *
 * @return the list held by this instance (the same object, not a copy)
 */
public List<DataModel> getSentenceList() {
    return this.sentenceList;
}

/**
 * Replaces the working list of bigram records.
 *
 * @param sentenceList list to use from now on; stored as is, without copying
 */
public void setSentenceList(List<DataModel> sentenceList) {
    final List<DataModel> replacement = sentenceList;
    this.sentenceList = replacement;
}

测试用例:
日本H动漫下载)鬼作第五夜-杉本翔子
 [TVB2008][当狗爱上猫][国语][DVD-RMVB][04-05集]
夜来香社区官方网站
结论:60%有效.

不足之处:
1.没有采用词典,而用二元切分,速度慢,切出的词歧义性较大.
2.有些词用统计方法不准确.
3.干扰系数较大.

续,准备采用词典

发表于 2008-08-28 14:18 BT下载与小说520 阅读(711) 评论(0) 编辑收藏

姚明的NBA

BT下载

小说520

常用链接

留言簿(3)

随笔档案(28)

文章档案(1)

我最爱的网站

搜索

最新评论

阅读排行榜

评论排行榜