随笔-204  评论-90  文章-8  trackbacks-0
  由于lucene2.0+heritrix一书示例用的网站(http://mobile.pconline.com.cn/,http: //mobile.163.com/)改版了,书上实例不能运行,我又找了一个http://mobile.younet.com/进行开发并成功实现示 例,希望感兴趣的同学,近快实践,如果此网站也改了就又得改extractor了,哈哈!
search的Extractor代码如下,(别和书上实例相同)供大家参考:附件里有完整代码
package com.luceneheritrixbook.extractor.younet;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.util.Date;

import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.HasChildFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.ImageTag;
import org.htmlparser.util.NodeIterator;
import org.htmlparser.util.NodeList;

import com.luceneheritrixbook.extractor.Extractor;
import com.luceneheritrixbook.util.StringUtils;

/**
 * <p></p>
 * 
@author cnyqiao@hotmail.com
 * @date   Feb 6, 2009 
 
*/

public class ExtractYounetMoblie extends Extractor {

    @Override
    
public void extract() {
        BufferedWriter bw 
= null;
        NodeFilter title_filter 
= new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class""mo_tit"));
        NodeFilter attribute_filter 
= new AndFilter(new TagNameFilter("p"), new HasChildFilter(new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("class""gn_sp1 blue1"))));
        NodeFilter img_filter 
= new AndFilter(new TagNameFilter("span"), new HasChildFilter(new TagNameFilter("img")));
        
        
//提取标题信息
        try {
            
//Parser根据过滤器返回所有满足过滤条件的节点
            
// 迭代逐渐查找
            NodeList nodeList=this.getParser().parse(title_filter);
            NodeIterator it 
= nodeList.elements();
            StringBuffer title 
= new StringBuffer();
            
while (it.hasMoreNodes()) {
                Node node 
= (Node) it.nextNode();
                String[] names 
= node.toPlainTextString().split(" ");
                
for(int i = 0; i < names.length; i++)
                    title.append(names[i]).append(
"-");
                title.append(
new Date().getTime());
                
//创建要生成的文件
                bw = new BufferedWriter(new FileWriter(new File(this.getOutputPath() + title + ".txt")));
                
//获取当前提取页的完整URL地址
                int startPos = this.getInuputFilePath().indexOf("mirror"+ 6;
                String url_seg 
= this.getInuputFilePath().substring(startPos);
                url_seg 
= url_seg.replaceAll("\\\\""/");
                String url 
= "http:/" + url_seg;
                
//写入当前提取页的完整URL地址
                bw.write(url + NEWLINE);
                bw.write(names[
0+ NEWLINE);
                bw.write(names[
1+ NEWLINE);
                
            }
            
// 重置Parser
            this.getParser().reset();
            Parser attNameParser 
= null;
            Parser attValueParser 
= null;
            
//Parser parser=new Parser("http://www.sina.com.cn");
            NodeFilter attributeName_filter = new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("class""gn_sp1 blue1"));
            NodeFilter attributeValue_filter 
= new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("class""gn_sp2"));
            String attName 
= "";
            String attValue 
= "";
            
// 迭代逐渐查找
            nodeList=this.getParser().parse(attribute_filter);
            it 
= nodeList.elements();
            
while (it.hasMoreNodes()) {                
                Node node 
= (Node) it.nextNode();
                attNameParser 
= new Parser();
                attNameParser.setEncoding(
"GB2312");
                attNameParser.setInputHTML(node.toHtml());
                NodeList attNameNodeList 
= attNameParser.parse(attributeName_filter);
                attName 
= attNameNodeList.elements().nextNode().toPlainTextString();
                
                attValueParser 
= new Parser();
                attValueParser.setEncoding(
"GB2312");
                attValueParser.setInputHTML(node.toHtml());
                NodeList attValueNodeList 
= attValueParser.parse(attributeValue_filter);
                attValue 
= attValueNodeList.elements().nextNode().toPlainTextString();
                bw.write(attName.trim() 
+ attValue.trim());
                bw.newLine();
            }
            
// 重置Parser
            this.getParser().reset();
            String imgUrl 
= "";
            String fileType 
="";
            
// 迭代逐渐查找
            nodeList=this.getParser().parse(img_filter);
            it 
= nodeList.elements();
            
while (it.hasMoreNodes()) {                
                Node node 
= (Node) it.nextNode();
                
                ImageTag imgNode 
= (ImageTag)node.getChildren().elements().nextNode();
                imgUrl 
= imgNode.getAttribute("src");                
                fileType 
= imgUrl.trim().substring(imgUrl
                        .lastIndexOf(
"."+ 1);
                
//生成新的图片的文件名
                String new_iamge_file = StringUtils.encodePassword(imgUrl, HASH_ALGORITHM) + "." + fileType;
                
//imgUrl = new HtmlPaserFilterTest().replace(new_iamge_file, "+", " ");
                
//利用miorr目录下的图片生成的新的图片
                this.copyImage(imgUrl, new_iamge_file);
                bw.write(SEPARATOR 
+ NEWLINE);
                bw.write(new_iamge_file 
+ NEWLINE);
            }
            
            
        } 
catch(Exception e) {
            e.printStackTrace();
        } 
finally {
            
try{
                
if (bw != null)
                    bw.close();
            }
catch(IOException e){
                e.printStackTrace();
            }
        }
        
    }
}
运行书上的heritrix实例,并按书上的默认设置进行抓取如下URI:(请自己分析整理)
http://mobile.younet.com/files/list_1.html
http://mobile.younet.com/files/list_2.html
http://mobile.younet.com/files/list_3.html

posted on 2009-02-09 15:44 一凡 阅读(2353) 评论(5)  编辑  收藏 所属分类: 搜索

评论:
# re: lucene2.0+heritrix示例补充[未登录] 2009-03-29 16:49 | lq

很好,强烈支持!  回复  更多评论
  
# re: lucene2.0+heritrix示例补充[未登录] 2011-01-14 00:35 | aaaaa
谢谢  回复  更多评论
  
# re: lucene2.0+heritrix示例补充[未登录] 2011-07-07 23:57 | 小龙
啊  回复  更多评论
  
# re: lucene2.0+heritrix示例补充[未登录] 2011-07-08 00:01 | 小龙
看到这篇博客就想看到救命恩人一样,楼主,我请你务必要告诉我,import com.luceneheritrixbook.util.StringUtils;
这到底是个什么包啊!我的确买了一本《开发自己的搜索引擎》我按照书上代码敲,结果找不到这个包的,我怀疑这个自己写的,不是源代码,请楼主务必要和我联系,帮我解答,谢谢了,我郁闷啊!!我的邮箱a542107840@qq.com QQ:542107840,  回复  更多评论
  
# re: lucene2.0+heritrix示例补充 2011-07-08 11:47 | willpower88
我在javaeye的博客里放了源码,你去下吧
http://willpower88.iteye.com/admin/blogs/325722  回复  更多评论
  

只有注册用户登录后才能发表评论。


网站导航: