网络爬虫——人人好友相册下载

　　请求登录人人网比较麻烦，需要记住cookie，尤其是这句代码，
httpContext.setAttribute(ClientContext.COOKIE_STORE,httpClient.getParams().getParameter("CookieStore"));
试了很多遍才找到httpClient.getParams().getParameter("CookieStore")。

主要代码如下：

package com.koyo.downloadphoto.service.impl;

import java.io.File;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.NameValuePair;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.protocol.ClientContext;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;
import org.apache.log4j.Logger;
import org.htmlparser.Parser;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.ImageTag;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.tags.Span;
import org.htmlparser.util.NodeList;

import com.koyo.downloadphoto.service.Spider;
import com.koyo.downloadphoto.utils.HttpUtils;
import com.koyo.downloadphoto.utils.ParseUtils;

/**
 * Spider that logs in to renren.com and downloads the photos found in a
 * friend's albums.
 *
 * <p>The fields {@code friendId}, {@code loginName}, {@code loginPassword},
 * {@code httpClient} and {@code httpContext} are not declared in this class;
 * presumably they are inherited from {@link Spider} (confirm against the base
 * class).
 *
 * <p>Photos are stored as
 * {@code D:/PhotosForRenRen/<friend name>/<album name>/<photo file name>}.
 * A photo whose target file already exists is not downloaded again.
 */
public class SpiderForRenRen extends Spider {

    private static final Logger logger = Logger.getLogger(SpiderForRenRen.class);

    /** Local root directory under which all downloaded photos are stored. */
    private static final String DOWNLOAD_ROOT = "D:/PhotosForRenRen/";

    /**
     * Logs in, reads the friend's album list page and downloads every photo of
     * every album listed there.
     *
     * <p>Any failure aborts the run; the exception is written to the log and
     * is not propagated to the caller.
     */
    @Override
    public void execute() {
        try {
            String albumListUrl = "http://photo.renren.com/photo/" + friendId
                    + "/album/relatives";

            loginToRenRen();

            String albumListHtml = fetchRequiredHtml(albumListUrl);
            String friendName = extractFriendName(albumListHtml);

            List<LinkTag> albumLinks = ParseUtils.parseTags(albumListHtml,
                    LinkTag.class, "class", "album-cover");
            List<Span> albumNameSpans = ParseUtils.parseTags(albumListHtml,
                    Span.class, "class", "album-name");
            if (albumLinks == null) {
                return;
            }

            // Album names are matched to album links by position, so there
            // must be at least one name per link.
            if (albumNameSpans == null
                    || albumNameSpans.size() < albumLinks.size()) {
                throw new IllegalStateException(
                        "album-name count does not match album-cover count on "
                                + albumListUrl);
            }

            for (int i = 0; i < albumLinks.size(); i++) {
                String rawName = albumNameSpans.get(i).getStringText();
                // The avatar album has a <span class="userhead"/> in front of
                // its name, so trim() is not enough: keep only the text after
                // the last line break.
                String albumName = rawName
                        .substring(rawName.lastIndexOf("\n") + 1);
                downloadRenRenAlbum(friendName, albumName,
                        albumLinks.get(i).getLink());
            }
        } catch (Exception e) {
            // Best effort: report the failure through the logger (with stack
            // trace) instead of printStackTrace(), but do not rethrow.
            logger.error("下载人人网相册失败", e);
        }
    }

    /**
     * Posts the login form, follows the first link found in the login
     * response and stores the cookie store in the HTTP context so that later
     * requests share the logged-in session.
     *
     * <p>Does nothing if the login response has no entity.
     *
     * @throws Exception if a request fails or the login response contains no
     *         link to follow
     */
    private void loginToRenRen() throws Exception {
        HttpPost post = new HttpPost("http://www.renren.com/PLogin.do");
        List<NameValuePair> formParams = new ArrayList<NameValuePair>();
        formParams.add(new BasicNameValuePair("email", loginName));
        formParams.add(new BasicNameValuePair("password", loginPassword));
        post.setEntity(new UrlEncodedFormEntity(formParams, "utf-8"));

        HttpResponse response = httpClient.execute(post, httpContext);
        HttpEntity entity = response.getEntity();
        if (entity == null) {
            return;
        }

        // Decode with the charset declared by the response (falling back to
        // UTF-8) rather than the platform default; this also fully consumes
        // and closes the entity's content stream.
        String loginHtml = EntityUtils.toString(entity, "utf-8");
        LinkTag redirectLink = ParseUtils.parseTag(loginHtml, LinkTag.class);
        if (redirectLink == null) {
            String message = "登录失败：登录响应中未找到跳转链接";
            logger.error(message);
            throw new RuntimeException(message);
        }

        HttpGet get = new HttpGet(redirectLink.getLink());
        response = httpClient.execute(get, httpContext);
        // Keep the cookies for the following requests.
        httpContext.setAttribute(ClientContext.COOKIE_STORE,
                httpClient.getParams().getParameter("CookieStore"));
        EntityUtils.consume(response.getEntity());

        // The password is intentionally not echoed to the console.
        System.out.println("账号：" + loginName);
    }

    /**
     * Fetches the page at {@code url} and fails loudly when no content could
     * be obtained.
     *
     * @param url address of the page to fetch
     * @return the page content, never {@code null}
     * @throws Exception if the fetch fails or yields {@code null}
     */
    private String fetchRequiredHtml(String url) throws Exception {
        String html = HttpUtils.getHtml(httpClient, httpContext, url);
        if (html == null) {
            String message = "无法获取【" + url + "】网址的内容";
            logger.error(message);
            throw new RuntimeException(message);
        }
        return html;
    }

    /**
     * Extracts the friend's name from the album list page: the trimmed text
     * of the first link inside the {@code <ul class="nav-tabs">} element.
     *
     * @param albumListHtml content of the album list page
     * @return the trimmed link text
     * @throws Exception if parsing fails or no such link is present
     */
    private String extractFriendName(String albumListHtml) throws Exception {
        Parser parser = new Parser();
        parser.setInputHTML(albumListHtml);
        AndFilter navTabsFilter = new AndFilter(new TagNameFilter("ul"),
                new HasAttributeFilter("class", "nav-tabs"));
        NodeList navTabs = parser.parse(navTabsFilter);

        LinkTag nameLink = ParseUtils.parseTag(navTabs.toHtml(), LinkTag.class);
        if (nameLink == null) {
            throw new IllegalStateException(
                    "friend name link not found in nav-tabs");
        }
        return nameLink.getLinkText().trim();
    }

    /**
     * Downloads every photo linked from one album page.
     *
     * @param friendName name of the friend, used as first directory level
     * @param albumName name of the album, used as second directory level
     * @param albumUrl address of the album page
     * @throws Exception if a page cannot be fetched or a photo cannot be saved
     */
    private void downloadRenRenAlbum(String friendName, String albumName,
            String albumUrl) throws Exception {
        String albumHtml = fetchRequiredHtml(albumUrl);
        List<LinkTag> photoLinks = ParseUtils.parseTags(albumHtml,
                LinkTag.class, "class", "picture");
        if (photoLinks == null) {
            return;
        }
        for (LinkTag photoLink : photoLinks) {
            downloadRenRenPhoto(friendName, albumName, photoLink.getLink());
        }
    }

    /**
     * Opens one photo page, locates the {@code <img id="photo">} element and
     * saves the image it points to, unless the target file already exists.
     *
     * @param friendName name of the friend, used as first directory level
     * @param albumName name of the album, used as second directory level
     * @param photoPageUrl address of the page showing the photo
     * @throws Exception if the page cannot be fetched or the file cannot be
     *         written
     */
    private void downloadRenRenPhoto(String friendName, String albumName,
            String photoPageUrl) throws Exception {
        String photoHtml = fetchRequiredHtml(photoPageUrl);
        ImageTag imageTag = ParseUtils.parseTag(photoHtml, ImageTag.class,
                "id", "photo");
        if (imageTag == null) {
            return;
        }

        // Value of the <img> src attribute; the file name is everything from
        // the last '/' on (the slash is kept and acts as path separator).
        String imageUrl = imageTag.getImageURL();
        String photoName = imageUrl.substring(imageUrl.lastIndexOf("/"));

        File localFile = new File(DOWNLOAD_ROOT + friendName + "/" + albumName
                + photoName);
        // Skip photos that were already downloaded.
        if (localFile.exists()) {
            return;
        }

        byte[] image = HttpUtils.getImage(httpClient, httpContext, imageUrl);
        if (image == null) {
            // NOTE(review): assumes getImage may return null on failure -
            // confirm. Writing nothing avoids leaving an empty file that
            // would make later runs skip this photo.
            logger.error("无法下载图片【" + imageUrl + "】");
            return;
        }

        // Creates missing parent directories and always closes the output
        // stream, so no file handle is leaked per photo.
        FileUtils.writeByteArrayToFile(localFile, image);
        System.out.println("图片【" + imageUrl + "】已下载");
    }
}

posted on 2012-02-23 13:42 aya000 阅读(503) 评论(0) 编辑收藏

常用链接

留言簿

随笔档案

搜索

最新评论

阅读排行榜

评论排行榜


只有注册用户登录后才能发表评论。




网站导航: 博客园博客园最新博文博问管理