1: /**
2: *
3: */
4: package com.taobao.cd.http.image;
5:
6: import java.io.IOException;
7: import java.io.InputStream;
8: import java.net.URL;
9: import java.net.URLConnection;
10: import java.util.HashSet;
11: import java.util.Set;
12: import java.util.concurrent.ExecutorService;
13: import java.util.concurrent.Executors;
14:
15: import javax.imageio.ImageIO;
16: import javax.imageio.ImageReader;
17: import javax.imageio.stream.ImageInputStream;
18:
19: import org.htmlparser.NodeFilter;
20: import org.htmlparser.Parser;
21: import org.htmlparser.filters.TagNameFilter;
22: import org.htmlparser.tags.ImageTag;
23: import org.htmlparser.util.NodeList;
24:
25: import com.taobao.cd.http.util.HttpUtil;
26: import com.taobao.cd.http.util.ImageReaderFactory;
27: import com.taobao.cd.http.util.ParserPool;
28:
29: /**
30: * 这是一个图片抓取器,通过给定url抓取该页面的所有img 可定制,过滤 ver 1.0: 只是初级实现图片抓取
31: *
32: * @author zunyuan.jy
33: *
34: * @date 2011-11-2
35: */
36: public class ImageSpider {
37:
38: private Set<String> imgSet; // 用于记录已经下载过的图像url
39:
40: private int customedSize; // 支持定制的图像大小,单位是KB
41:
42: public ImageSpider() {
43: this(0);
44: }
45:
46: public ImageSpider(int s) {
47: this.customedSize = s;
48: imgSet = new HashSet<String>();
49: }
50:
51: /**
52: * 抓取指定url页面的所有图像数据
53: *
54: * @param url
55: * 页面url
56: * @param path
57: * 要将图片保存的路径
58: * @throws Exception
59: */
60: public void crawl(String url, final String path) throws Exception {
61: URL u = new URL(url);
62: URLConnection con = (u.openConnection());
63: con.setRequestProperty("User-Agent", HttpUtil.UA);
64: org.htmlparser.scanners.ScriptScanner.STRICT = false;
65: org.htmlparser.lexer.Lexer.STRICT_REMARKS = false;
66: Parser parser = ParserPool.getInstance().borrowOne();
67: parser.setConnection(con);
68:
69: NodeFilter filter = new TagNameFilter("img");
70: NodeList nodes = parser.extractAllNodesThatMatch(filter);
71: ImageTag node = null;
72: String imgSrc;
73: String suffix;
74: if (nodes != null) {
75: for (int i = 0; i < nodes.size(); i++) {
76: node = (ImageTag) nodes.elementAt(i);
77: imgSrc = node.getImageURL();
78:
79: if (!imgSet.contains(imgSrc)) {
80: imgSet.add(imgSrc);
81: suffix = imgSrc.substring(imgSrc.lastIndexOf(".") + 1);
82: if (suffix.equalsIgnoreCase(ImageUtil.JPG)
83: || suffix.equalsIgnoreCase(ImageUtil.PNG)
84: || suffix.equalsIgnoreCase(ImageUtil.GIF)
85: || suffix.equalsIgnoreCase(ImageUtil.BMP)) {
86: URL uu = new URL(imgSrc);
87: if (customedSize == 0 || filterSize(uu, suffix)) {
88: ImageUtil.writeImg(uu, path, suffix);
89: }
90: } else {
91: System.err.println(suffix
92: + ":img format not supported!");
93: }
94: }
95: }
96: }
97: }
98:
99: private boolean filterSize(URL u, String suffix) throws IOException {
100: InputStream is = u.openStream();
101: ImageInputStream stream = ImageIO.createImageInputStream(is);
102: ImageReader ir = ImageReaderFactory.getInstance().createImageReader(
103: suffix);
104: if (ir != null) {
105: ir.setInput(stream, true, false);
106: int w = ir.getWidth(0);
107: int h = ir.getHeight(0);
108: if (w * h < customedSize * 1024 * 3 + 100) {
109: return true;
110: } else {
111: return false;
112: }
113: } else {
114: System.err.println(u.getFile() + ":read img header error!");
115: return false;
116: }
117: }
118:
119: /**
120: * @param args
121: */
122: public static void main(String[] args) {
123: // TODO Auto-generated method stub
124:
125: }
126: }