登录人人网比较麻烦,需要在登录后保存 cookie,关键是下面这句代码:
httpContext.setAttribute(ClientContext.COOKIE_STORE, httpClient.getParams().getParameter("CookieStore"));
其中的 httpClient.getParams().getParameter("CookieStore") 是试了很多遍才找到的写法。
主要代码如下:
package com.koyo.downloadphoto.service.impl;
import java.io.File;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.NameValuePair;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.protocol.ClientContext;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;
import org.apache.log4j.Logger;
import org.htmlparser.Parser;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.ImageTag;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.tags.Span;
import org.htmlparser.util.NodeList;
import com.koyo.downloadphoto.service.Spider;
import com.koyo.downloadphoto.utils.HttpUtils;
import com.koyo.downloadphoto.utils.ParseUtils;
/**
 * Spider that logs in to renren.com and downloads the photos found in the
 * albums of the friend identified by {@code friendId}.
 *
 * <p>Photos are stored as
 * {@code D:/PhotosForRenRen/<friend name>/<album name>/<file name>}; a photo
 * whose local file already exists is not downloaded again.
 *
 * <p>The class reads {@code friendId}, {@code loginName}, {@code loginPassword},
 * {@code httpClient} and {@code httpContext}, none of which are declared here
 * (presumably they are inherited from {@link Spider} - confirm there).
 */
public class SpiderForRenRen extends Spider {

    private static final Logger logger = Logger.getLogger(SpiderForRenRen.class);

    /** Local root directory under which downloaded photos are stored. */
    private static final String PHOTO_ROOT_DIR = "D:/PhotosForRenRen/";

    /**
     * Logs in, reads the friend's album list page and downloads every photo of
     * every album listed on it.
     *
     * <p>Any failure aborts the run and is reported through the logger; no
     * exception is propagated to the caller.
     */
    @Override
    public void execute() {
        try {
            String albumListUrl = "http://photo.renren.com/photo/" + friendId
                    + "/album/relatives";

            loginToRenRen();

            String albumListHtml = fetchHtmlOrThrow(albumListUrl);
            String friendName = parseFriendName(albumListHtml);

            List<LinkTag> albumLinks = ParseUtils.parseTags(albumListHtml,
                    LinkTag.class, "class", "album-cover");
            List<Span> albumNameSpans = ParseUtils.parseTags(albumListHtml,
                    Span.class, "class", "album-name");
            if (albumLinks == null) {
                return;
            }

            // Album links and album-name spans are paired by index, so only
            // indexes present in both lists can be processed safely.
            int nameCount = (albumNameSpans == null) ? 0 : albumNameSpans.size();
            int albumCount = Math.min(albumLinks.size(), nameCount);
            if (nameCount != albumLinks.size()) {
                logger.warn("相册链接数(" + albumLinks.size() + ")与相册名称数("
                        + nameCount + ")不一致,仅处理前" + albumCount + "个相册");
            }

            for (int i = 0; i < albumCount; i++) {
                String rawName = albumNameSpans.get(i).getStringText();
                // The avatar album has a <span class="userhead"/> in front of
                // its name, so trim() is not enough: keep only the text after
                // the last line break.
                String albumName = rawName.substring(rawName.lastIndexOf("\n") + 1);
                downloadAlbum(friendName, albumName, albumLinks.get(i).getLink());
            }
        } catch (Exception e) {
            // Use the class logger (with the stack trace) instead of printStackTrace().
            logger.error("下载人人网照片失败", e);
        }
    }

    /**
     * Posts the login form, follows the link contained in the login response
     * and stores the cookie store in {@code httpContext}.
     *
     * <p>Does nothing further when the login response has no entity.
     *
     * @throws IllegalStateException if the login response contains no link to follow
     * @throws Exception if an HTTP request or the HTML parsing fails
     */
    private void loginToRenRen() throws Exception {
        HttpPost post = new HttpPost("http://www.renren.com/PLogin.do");
        List<NameValuePair> nvps = new ArrayList<NameValuePair>();
        nvps.add(new BasicNameValuePair("email", loginName));
        nvps.add(new BasicNameValuePair("password", loginPassword));
        post.setEntity(new UrlEncodedFormEntity(nvps, "utf-8"));

        HttpResponse response = httpClient.execute(post, httpContext);
        HttpEntity entity = response.getEntity();
        if (entity == null) {
            return;
        }

        // Decode with the charset declared by the response (UTF-8 when none is
        // declared); EntityUtils also closes the content stream.
        String loginHtml = EntityUtils.toString(entity, "utf-8");
        LinkTag redirectLink = ParseUtils.parseTag(loginHtml, LinkTag.class);
        if (redirectLink == null) {
            throw new IllegalStateException("登录失败:登录响应中未找到跳转链接");
        }

        HttpGet get = new HttpGet(redirectLink.getLink());
        response = httpClient.execute(get, httpContext);
        // Keep the cookies for the following requests.
        // NOTE(review): "CookieStore" is looked up as a raw parameter name -
        // confirm it really resolves to the client's cookie store.
        httpContext.setAttribute(ClientContext.COOKIE_STORE, httpClient
                .getParams().getParameter("CookieStore"));
        EntityUtils.consume(response.getEntity());

        // Only the account name is echoed; the password must never be printed.
        System.out.println("账号:" + loginName);
    }

    /**
     * Fetches the page at {@code url} through {@code HttpUtils.getHtml}.
     *
     * @param url address of the page to fetch
     * @return the page content, never {@code null}
     * @throws RuntimeException if {@code HttpUtils.getHtml} returns {@code null}
     * @throws Exception if the underlying request fails
     */
    private String fetchHtmlOrThrow(String url) throws Exception {
        String html = HttpUtils.getHtml(httpClient, httpContext, url);
        if (html == null) {
            logger.error("无法获取【" + url + "】网址的内容");
            throw new RuntimeException("无法获取【" + url + "】网址的内容");
        }
        return html;
    }

    /**
     * Extracts the friend's name: the trimmed text of the link that
     * {@code ParseUtils.parseTag} finds inside the {@code <ul class="nav-tabs">}
     * element of the album list page.
     *
     * @param albumListHtml content of the album list page
     * @return the trimmed link text
     * @throws IllegalStateException if no such link is found
     * @throws Exception if the HTML parsing fails
     */
    private String parseFriendName(String albumListHtml) throws Exception {
        Parser parser = new Parser();
        parser.setInputHTML(albumListHtml);
        AndFilter navTabsFilter = new AndFilter(new TagNameFilter("ul"),
                new HasAttributeFilter("class", "nav-tabs"));
        NodeList navTabs = parser.parse(navTabsFilter);

        LinkTag nameLink = ParseUtils.parseTag(navTabs.toHtml(), LinkTag.class);
        if (nameLink == null) {
            throw new IllegalStateException("无法从相册页面解析好友名");
        }
        return nameLink.getLinkText().trim();
    }

    /**
     * Downloads every photo linked from one album page (links with
     * {@code class="picture"}).
     *
     * @param friendName first directory level below the photo root
     * @param albumName second directory level below the photo root
     * @param albumUrl address of the album page
     * @throws Exception if a page cannot be fetched or a photo cannot be saved
     */
    private void downloadAlbum(String friendName, String albumName, String albumUrl)
            throws Exception {
        String albumHtml = fetchHtmlOrThrow(albumUrl);
        List<LinkTag> photoLinks = ParseUtils.parseTags(albumHtml,
                LinkTag.class, "class", "picture");
        if (photoLinks == null) {
            return;
        }
        for (LinkTag photoLink : photoLinks) {
            downloadPhoto(friendName, albumName, photoLink.getLink());
        }
    }

    /**
     * Downloads the image shown on one photo page (the {@code <img id="photo">}
     * tag) unless its local file already exists.
     *
     * @param friendName first directory level below the photo root
     * @param albumName second directory level below the photo root
     * @param photoPageUrl address of the photo page
     * @throws Exception if the page cannot be fetched or the file cannot be written
     */
    private void downloadPhoto(String friendName, String albumName, String photoPageUrl)
            throws Exception {
        String photoHtml = fetchHtmlOrThrow(photoPageUrl);
        ImageTag imageTag = ParseUtils.parseTag(photoHtml, ImageTag.class,
                "id", "photo");
        if (imageTag == null) {
            return;
        }

        // Value of the src attribute; the local file name is its last path
        // segment, including the leading "/".
        String imageUrl = imageTag.getImageURL();
        String photoName = imageUrl.substring(imageUrl.lastIndexOf("/"));
        File imageFile = new File(PHOTO_ROOT_DIR + friendName + "/" + albumName
                + photoName);

        // Skip photos that were already downloaded.
        if (imageFile.exists()) {
            return;
        }

        byte[] image = HttpUtils.getImage(httpClient, httpContext, imageUrl);
        if (image == null) {
            // Do not create an empty file: it would be mistaken for a
            // finished download on the next run.
            logger.error("无法下载图片【" + imageUrl + "】");
            return;
        }

        // writeByteArrayToFile creates missing parent directories and always
        // closes the output stream, which the previous
        // IOUtils.write(..., FileUtils.openOutputStream(...)) call did not.
        FileUtils.writeByteArrayToFile(imageFile, image);
        System.out.println("图片【" + imageUrl + "】已下载");
    }
}