|
常用链接
留言簿(3)
随笔档案(28)
文章档案(1)
我最爱的网站
搜索
最新评论
阅读排行榜
评论排行榜
Powered by: 博客园
模板提供:沪江博客
|
|
|
|
|
发新文章 |
|
|
一.问题提出:像这样的"[TVB2008][溏心风暴之家好月圆][国语][DVD-RMVB][第01集]|BT285.cn|BT下载|BT电影|BT软件"的标题,我们怎样提取出关键字"家好月圆",当然是在电影或是电视剧的范围内.
二.不用词典,你用什么方式分词.
解决方法:
1.针对此网站大部分都是BT下载,一般都是电影,电视剧,动漫.先对整个网站的标题进行Lucene索引.
2.去掉一些常用符号如:数字,字母,标志符.如"[TVB2008][溏心风暴之家好月圆][国语][DVD-RMVB][第01集]|BT285.cn|BT下载|BT电影|BT软件" 变成"溏心风暴之家好月圆 国语 第 集"
3.采用二元分法.如"溏心风暴之家好月圆 国语 第 集" 分拆成"溏心 心风 风暴 暴之 之家 家好 好月 月圆 国语 第 集"
4.采用统计方法,去掉一些高频词,如"国语 第 集",因为"国语"出现了3000次(注:数据库共有25万条数据),"第"与"集"也是如此.
5.对"溏心 心风 风暴 暴之 之家 家好 好月 月圆 国语 第 集"进行分析并统计次数.
6.按最长连接词拼凑.按一定的比例组合划分.
7.得出"家好月圆"为其关键字
上代码:
package com.wwy.lucene;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.List;
import org.apache.log4j.Logger;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Searcher;
import org.springframework.context.support.ClassPathXmlApplicationContext;
import com.funinhand.orm.database.base.Page;
import com.funinhand.shld.dao.Bt285DAO;
import com.funinhand.shld.dao.WNewsDAO;
import com.funinhand.shld.pojo.Bt285;
import com.funinhand.shld.pojo.WNews;
import com.wwy.thread.Configure;
/** *//**
* 本代码在下面的网站中测试过并取的60%左右的有效数.
*
* http://www.bt285.cn BT下载
* http://www.yaonba.com.cn NBA中文网
* http://www.5a520.cn 小说520网
* http://www.vagaaga.cn vagaaga
*author by wangdei
*2008-6-12 上午11:16:41
*
*/
public class KeyWordLucene {
private static Logger logger = Logger.getLogger(KeyWordLucene.class);
public static String[] StopStrs = {"BT285","BT软件","BT电影","BT下载"};
public static String path = null ;
private List<DataModel> sentenceList = new ArrayList<DataModel>();
public KeyWordLucene(){
logger.debug("KeyWordLucene init");
}
public int queryMatchSize(String queryStr,String lucennePath){
Searcher searcher = null;
try {
searcher = new IndexSearcher(lucennePath);// "index"指定索引文件位置
Document doc = new Document();
StandardAnalyzer analyzer = new StandardAnalyzer(StopStrs);
// 一段简单的检索代码
QueryParser queryParser = new QueryParser("title", analyzer);
Query query = queryParser.parse(queryStr);
// 检索
Hits hits = searcher.search(query);
int size = hits.length();
searcher.close();
return size;
} catch (Exception e) {
return 0;
}finally{
if(searcher!=null){
try {
searcher.close();
}
catch (Exception e) {}
searcher = null;
}
}
}
public boolean stopWord(String word){
char charAsi = word.toCharArray()[0];
int intAsi = (int)charAsi;
if(intAsi<127)
return true;
/**//*
* 65293 -,12289 、,9670 ◆,9733 ★,12300 「,8545 Ⅱ,65281 !,65288 ( ,65306 :,
*/
int[] asiExpert = {65293,12289,9670,9733,12300,8545,65281,65288,65306};
for(int i=0;i<asiExpert.length;i++){
if(asiExpert[i] == intAsi)
return true;
}
String[] str = {"【","】","[","]","."," ","/","\\","》","《","?","的"};
for(int i=0;i<str.length;i++){
if(word.equals(str[i]))
return true;
}
return false;
}
/** *//**
* 返回整个句子的集合
* cat news Id=211562,【There.Will.Be.Blood.血色黑金.2007.Blu-ray.a1080.x264.DD51 D9】
血色=165,j=21,time=23
色黑=19,j=22,time=23
黑金=86,j=23,time=23
* @param news
* @return
*/
private void charMatch(WNews news){
sentenceList.clear();
String title = news.getTitle();
int matchSize = 0;
int length = title.length();
for(int j=0;j<length-2;j++){
if(stopWord(title.substring(j,j+1)) || stopWord(title.substring(j+1,j+2)))
continue;
DataModel dataModel = new DataModel();
long begin = System.currentTimeMillis();
matchSize =queryMatchSize(title.substring(j,j+2), path);
long needsTime = (System.currentTimeMillis()-begin);
dataModel.setName(title.substring(j,j+2));
dataModel.setSize(matchSize);
dataModel.setLocation(j);
dataModel.setNeedTime(needsTime);
//logger.debug(dataModel.toString());
sentenceList.add(dataModel);
}
}
/** *//**
* 提取字符串的.
* [古天乐刘若英最新钜献][我们这一家][粤语中字][那些发垃圾信息的不是我]
* [天乐=122,j=2,time=16,
* 刘若=84,j=4,time=16,
* 若英=84,j=5,time=0,
* 最新=10298,j=7,time=31,
* 钜献=11,j=9,time=16,
* 我们=441,j=13,time=15,
* 一家=518,j=16,time=16,
* 语中=31128,j=21,time=63,
* 中字=36065,j=22,time=78,
* 那些=17,j=26,time=16,
* 垃圾=24,j=29,time=94,
* 信息=12,j=31,time=0,
* 不是=167,j=34,time=15]
*/
private void pickChar(){
List<DataModel> tempList = new ArrayList<DataModel>();
int size = sentenceList.size();
//取二元比较字符串
int i = 0;
while(i<size){
DataModel dataModel1 = sentenceList.get(i);
if(i == size-1){
tempList.add(dataModel1);
sentenceList.clear();
setSentenceList(tempList);
return;
}
DataModel dataModel2 = sentenceList.get(i+1);
int location1 = dataModel1.getLocation();
int location2 = dataModel2.getLocation();
//如果是相邻的
if(location1 +1 == location2){
if(dataModel1.getSize()>=dataModel2.getSize())
tempList.add(dataModel1);
else
tempList.add(dataModel2);
i+=2;
if(i == size){
//加最后一个字
sentenceList.clear();
setSentenceList(tempList);
return;
}
}else{
tempList.add(dataModel1);
//tempList.add(dataModel2);
i++;
}
}
return;
}
/** *//**
* 去掉出现机率大的字符:如美国,大片,影视,==
*
*/
private void delRateBigChar(){
List<DataModel> tempList = new ArrayList<DataModel>();
int size = sentenceList.size();
int i = 0;
while(i<size){
DataModel dataModel = sentenceList.get(i);
if(dataModel.getSize()>1000){
i++;
continue;
}
tempList.add(dataModel);
i++;
}
sentenceList.clear();
setSentenceList(tempList);
}
/** *//**
* 组合字符
*
*/
private String compisteChar(){
List<DataModel> tempList = new ArrayList<DataModel>();
int size = sentenceList.size();
int i = 0;
while(i<size){
int j = i;
DataModel dataModel = new DataModel("",0,0);
StringBuffer sb = new StringBuffer();
boolean isFetch = false;
while(j<size){
DataModel dataModelJ1 = sentenceList.get(j);
int location1 = dataModelJ1.getLocation();
if(j == size-1){
dataModel.setName(dataModel.getName() + dataModelJ1.getName());
dataModel.setLocation(dataModel.getLocation() + dataModelJ1.getLocation());
dataModel.setAfterLocation(dataModelJ1.getAfterLocation());
tempList.add(dataModel);
break;
}
DataModel dataModelJ2 = sentenceList.get(j+1);
int location2 = dataModelJ2.getLocation();
if(dataModel.getAfterLocation() +2 <location1 && isFetch){
tempList.add(dataModel);
break;
}
if(location1 +1==location2){
isFetch = true;
sb.append(dataModelJ1.getName() +dataModelJ2.getName().substring(1));
dataModel.setName(dataModel.getName() + sb.toString());
dataModel.setAfterLocation(dataModelJ2.getAfterLocation());
dataModel.setLocation(dataModel.getLocation() + dataModelJ1.getLocation()+ dataModelJ2.getLocation());
}
else if(location1 +2==location2){
isFetch = true;
sb.append(dataModelJ1.getName() +dataModelJ2.getName());
dataModel.setName(dataModel.getName() + sb.toString());
dataModel.setAfterLocation(dataModelJ2.getAfterLocation());
dataModel.setLocation(dataModel.getLocation() + dataModelJ2.getLocation());
}
else{
dataModel.setName(dataModel.getName() + sb.toString());
dataModel.setAfterLocation(dataModelJ2.getAfterLocation());
dataModel.setLocation(dataModel.getLocation() + dataModelJ1.getLocation()+ dataModelJ2.getLocation());
tempList.add(dataModel);
break;
}
j+=2;
if(j == size){
tempList.add(dataModel);
break;
}
}
i++;
}
//取最长的字符串
int tempSize = tempList.size();
//如果字符串都超过给定的最大值时,或是其他情况
if(tempSize == 0)
return null;
DataModel dataModelInit = tempList.get(0);
for(int k=1;k<tempSize;k++){
String nameInit = dataModelInit.getName();
DataModel dataModel = tempList.get(k);
String name = dataModel.getName();
if(name.length()>nameInit.length())
dataModelInit.setName(name);
}
logger.info("getName()=" +dataModelInit.getName());
return dataModelInit.getName();
}
public String seqAction(WNews news){
if(logger.isDebugEnabled())
logger.debug("KeyWordLucene seqAction is begin");
charMatch(news);
pickChar();
delRateBigChar();
return compisteChar();
}
public static void main(String[] args) throws Exception {
System.out.println("server begin!");
Configure.propertiesConfigure();
path = Configure.getCreateBtLucenePath();
ClassPathXmlApplicationContext appContext = new ClassPathXmlApplicationContext(
"./mysqlContext.xml");
WNewsDAO newsDAO = (WNewsDAO) appContext.getBean("wNewDaoProxy");
Bt285DAO bt285DAO = (Bt285DAO)appContext.getBean("bt285DAO");
KeyWordLucene action = new KeyWordLucene();
for(int i=1;i<3;i++){
try {
Thread.sleep(1000L);
} catch (InterruptedException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
Page page = new Page();
logger.info("i=" + i);
page.setPageIndex(i);
page.setPageSize(1000);
List<WNews> list = bt285DAO.findPageByQuery(
"select t from wnews t ",null, page);
for(WNews news:list){
//if(news.getId() != 212142)
//continue;
logger.debug("cat news Id=" + news.getId() +"," + news.getTitle());
action.seqAction(news);
}
}
//action.batchCreate();
System.out.println(URLEncoder.encode("天兆","UTF-8"));//%E5%A4%A9%E5%85%86
System.out.println("server finish!");
}
public List<DataModel> getSentenceList() {
return sentenceList;
}
public void setSentenceList(List<DataModel> sentenceList) {
this.sentenceList = sentenceList;
}
}
测试用例:
日本H动漫下载)鬼作第五夜-杉本翔子
[TVB2008][当狗爱上猫][国语][DVD-RMVB][04-05集]
夜来香社区官方网站
结论:60%有效.
不足之处:
1.没有采用词典,而用二元分词,速度慢,且词有较大的歧义性.
2.有些词用统计方法不准确.
3.干扰系数较大.
续,准备采用词典
|
|