1: /**
2: *
3: */
4: package com.taobao.cd.http.util;
5:
6: import java.io.IOException;
7: import java.io.InputStream;
8: import java.net.URL;
9: import java.net.URLConnection;
10: import java.util.ArrayList;
11: import java.util.HashMap;
12: import java.util.List;
13: import java.util.concurrent.Callable;
14: import java.util.concurrent.ExecutionException;
15: import java.util.concurrent.ExecutorService;
16: import java.util.concurrent.Executors;
17: import java.util.concurrent.FutureTask;
18:
19: import javax.imageio.ImageIO;
20: import javax.imageio.ImageReader;
21: import javax.imageio.stream.ImageInputStream;
22:
23: import org.htmlparser.Node;
24: import org.htmlparser.NodeFilter;
25: import org.htmlparser.Parser;
26: import org.htmlparser.Tag;
27: import org.htmlparser.Text;
28: import org.htmlparser.filters.TagNameFilter;
29: import org.htmlparser.tags.ImageTag;
30: import org.htmlparser.tags.LinkTag;
31: import org.htmlparser.util.NodeList;
32: import org.htmlparser.util.ParserException;
33:
34: /**
35: * @author zunyuan.jy
36: *
37: * @date 2011-10-25
38: */
39: public class HtmlStat {
40:
41: private HashMap<Class<? extends Node>, Integer> statMap; // 存放所有标签的计数器
42:
43: private ExecutorService exec;
44:
45: private List<String> textList; // 存放文本数据的队列
46:
47: private List<LinkObject> linkList; // 存放超链接对应的文本
48:
49: private List<ImageObject> imageObjectList; // 存放处理完后的图像信息
50:
51: private Parser parser;
52:
53: private ParserPool parserPool;
54:
55: private String host;
56:
57: /**
58: * 先序遍历dom树 统计全文的字数 统计全文的标签总数 统计各类标签数,统计资源放入一个map
59: * 当发现有图像标记时,可以将其放入一个队列,然后开线程去预渲染图片(只读图片头) 图片渲染完毕后,列出图片的信息 统计全文的标签文字数,非空
60: *
61: * @throws IOException
62: * @throws ParserException
63: *
64: */
65:
66: public HtmlStat() {
67: parserPool = ParserPool.getInstance();
68: textList = new ArrayList<String>(20);
69: linkList = new ArrayList<LinkObject>(20);
70: imageObjectList = new ArrayList<ImageObject>(20);
71: statMap = new HashMap<Class<? extends Node>, Integer>(100);
72: }
73:
74: private void createParser(String url) throws Exception {
75: URL u = new URL(url);
76: host = u.getProtocol() + "://" + u.getHost();
77: URLConnection con = u.openConnection();
78: con.setRequestProperty("User-Agent", HttpUtil.UA);
79: org.htmlparser.scanners.ScriptScanner.STRICT = false;
80: org.htmlparser.lexer.Lexer.STRICT_REMARKS = false;
81: parser = parserPool.borrowOne();
82: parser.setConnection(con);
83: }
84:
85: public void analyse(String url) throws Exception {
86: createParser(url);
87: exec = Executors.newCachedThreadPool();
88:
89: parse();
90:
91: exec.shutdown();
92: returnParser();
93: }
94:
95: private void parse() throws ParserException {
96: NodeFilter filter = new TagNameFilter("body");
97: NodeList nodes = parser.extractAllNodesThatMatch(filter);
98: Node node = null;
99: if (nodes != null) {
100: for (int i = 0; i < nodes.size(); i++) {
101: node = nodes.elementAt(i);
102: recursiveParse(node);
103: }
104: }
105: }
106:
107: private void recursiveParse(Node node) {
108: if (node != null) {
109: NodeList nodes = node.getChildren();
110: Node tNode = null;
111: if (nodes != null) {
112: for (int i = 0; i < nodes.size(); i++) {
113: tNode = nodes.elementAt(i);
114: if (tNode instanceof Text) {
115: if (tNode.toPlainTextString().trim().equals(""))
116: continue;
117: if (node instanceof LinkTag) {
118: LinkTag aTag = (LinkTag) node;
119: if (aTag.isHTTPLink()
120: && aTag.getAttribute("href") != null) {
121: String link = aTag.getAttribute("href");
122: LinkObject lo = new LinkObject();
123: lo.setLink(link);
124: lo.setText(tNode.toPlainTextString().trim());
125: linkList.add(lo);
126: }
127: } else {
128: textList.add(tNode.toPlainTextString().trim());
129: }
130: /*
131: * if (node instanceof LinkTag) { LinkTag aTag =
132: * (LinkTag) node; if (aTag.isHTTPLink() &&
133: * aTag.getAttribute("href") != null){ String link =
134: * aTag.getAttribute("href"); } } else if (node
135: * instanceof Div) {
136: *
137: * } else if (node instanceof TableTag || node
138: * instanceof TableHeader) {
139: *
140: * } else if (node instanceof TableRow || node
141: * instanceof TableColumn) {
142: *
143: * } else if (node instanceof ParagraphTag) {
144: *
145: * } else if (node instanceof DefinitionListBullet) {
146: *
147: * } else if (node instanceof BulletList) {
148: *
149: * } else if (node instanceof Span) {
150: *
151: * } else if (node instanceof HeadingTag) {
152: *
153: * }
154: */
155: } else if (tNode instanceof ImageTag) {
156: ImageTag img = (ImageTag) tNode;
157: String url = img.getImageURL();
158: if (!url.startsWith("http://")
159: && !url.startsWith("https://")) {
160: url = host + url;
161: }
162: final String imgSrc = url;
163: final String alt = img.getAttribute("alt");
164: FutureTask<ImageObject> task = new FutureTask<ImageObject>(
165: new Callable<ImageObject>() {
166:
167: public ImageObject call() throws Exception {
168: // TODO Auto-generated method stub
169: URL u = new URL(imgSrc);
170: String suffix = imgSrc.substring(imgSrc
171: .lastIndexOf(".") + 1);
172: InputStream is = u.openStream();
173: ImageInputStream stream = ImageIO
174: .createImageInputStream(is);
175: ImageReader ir = ImageReaderFactory
176: .getInstance()
177: .createImageReader(suffix);
178: ImageObject io = new ImageObject();
179: if (ir != null) {
180: ir.setInput(stream, true, false);
181: int w = ir.getWidth(0);
182: int h = ir.getHeight(0);
183:
184: io.setUrl(imgSrc);
185: io.setWidth(w);
186: io.setHeight(h);
187: io.setFormat(suffix);
188: io.setAlt(alt);
189:
190: }
191: return io;
192: }
193:
194: });
195: try {
196: exec.execute(task);
197: imageObjectList.add(task.get());
198: } catch (InterruptedException e) {
199: e.printStackTrace();
200: } catch (ExecutionException e) {
201: e.printStackTrace();
202: }
203:
204: } else if (tNode instanceof Tag) {
205: int x = 1;
206: if (statMap.containsKey(tNode.getClass())) {
207: x = statMap.get(tNode.getClass());
208: x++;
209: }
210: statMap.put(tNode.getClass(), x);
211: recursiveParse(tNode);
212: }
213: }
214: }
215: }
216: }
217:
218: private void returnParser() throws Exception {
219: parserPool.returnOne(parser);
220: }
221:
222: public HashMap<Class<? extends Node>, Integer> getStatMap() {
223: return statMap;
224: }
225:
226: public List<String> getTextList() {
227: return textList;
228: }
229:
230: public List<LinkObject> getLinkList() {
231: return linkList;
232: }
233:
234: public List<ImageObject> getImageObjectList() {
235: return imageObjectList;
236: }
237:
238: public static void main(String[] args) {
239: HtmlStat hs = new HtmlStat();
240: try {
241: hs.analyse("http://163.com");
242: System.out.println("该页有标签种类:");
243: System.out.println(hs.getStatMap().size());
244: System.out.println("有图像标签:");
245: System.out.println(hs.getImageObjectList().size());
246: System.out.println("有文本标签:");
247: System.out.println(hs.getTextList().size());
248: System.out.println("有超链接标签");
249: System.out.println(hs.getLinkList().size());
250: } catch (Exception e) {
251: e.printStackTrace();
252: }
253: }
254: }