RSS内容抓取

Posted on 2008-09-13 08:33 Jason Chen 阅读(1247) 评论(0) 编辑收藏

   最近我们iwode在搞SEO，由于我们原来的网站上面并没有太注重加入这方面的内容，所以基本上我们的网站除了购物流程和品牌介绍外并没有太多其他的内容，感觉上我们的网站会好闷，所以我们想先实现一个功能，就是从一些RSS抓取内容，把正文内容转存到数据库包括所有的分页，同时去掉正文中的超链接（不链接到别人的网站），把图片转存到本地同时修改<img>标签的src路径。之后，我们可以根据数据库的内容，维护文章，改变样式，然后放到网站上。
   由于RSS本质上就是一个xml文档，所以第一步要做的就是取得网络上的一个xml文档，使其可操作。我用URL打开一个到该RSS的流，然后根据读回来的流建一个DOM，这样就可以读取xml文档中的内容。
   在一个RSS中，其中<item>标签记录着文章的信息，我关心的是其中的title，pubDate，link三个内容。title是文章的标题，pubDate是发布时间，link是该文章的链接。现在我们可以开始读取文章的内容，方法也是通过一个URL打开到文章的流，然后从流中把文章的内容(一个html)读到本地，然后再进行其他处理。
   所读取的RSS文章的正文内容都放在“<div class="Con"></div>”中，而分页信息则放在“<div class="LP T H"></div>”中。取正文内容只需把“<div class="Con"></div>”中的内容提取出来即可。要去掉文章中的链接信息并取得图片，首先要找出正文中所有的超链接和<img>标签，我是通过使用正则表达式来实现的。要去掉链接信息，只需把<a>和</a>标签替换成空字符串就可以了。取得<img>标签后，要找出src属性的值，它记录了图片的URL。依据该URL，使用ImageIO，可以很方便地把图片写到本地，最后要做的就是把src的值替换为刚才写到本地的文件的路径。
   具体的代码如下：

  1 /**
  2  *
  3  */
  4 package com.iwode.jason;
  5
  6 import java.awt.image.BufferedImage;
  7 import java.io.*;
  8 import java.net.URL;
  9 import java.text.SimpleDateFormat;
10 import java.util.ArrayList;
11 import java.util.Date;
12 import java.util.List;
13 import java.util.regex.*;
14 import org.w3c.dom.*;
15 import javax.imageio.ImageIO;
16
17 /**
18  * @author Jason
19  * @version 创建时间：2008-9-10 下午09:57:29
20  * @contact 联系方式: mailto:crjjason@163.com
21  */
22 public class RSSTester {
23
24     /**
25      * @param args
26      */
27     public static void main(String[] args) {
28     RssGetter rssGetter = new RssGetter();
29     rssGetter.getRssSource();
30     }
31 }
32
/**
 * Downloads an RSS feed and, for every {@code <item>} in it, fetches the linked
 * article together with its additional pages, extracts the article body,
 * strips the hyperlinks, copies the referenced images to local disk and rewrites
 * the {@code <img>} {@code src} attributes to the local copies.
 *
 * <p>The cleaned article body is printed to standard output; nothing is
 * persisted apart from the image files.
 */
class RssGetter {
    /** Feed fetched when no other URL has been set. */
    private static final String DEFAULT_FEED_URL = "http://news.tyloo.com/mens.rss";

    /** Directory the downloaded images are written to. */
    private static final String IMAGE_DIR = "D:\\pics\\";

    /** Character encoding assumed for every downloaded article page. */
    private static final String PAGE_ENCODING = "utf-8";

    /** Layout expected in {@code pubDate} once the 'T' separator is replaced by a space. */
    private static final String PUB_DATE_FORMAT = "yyyy-MM-dd HH:mm:ss";

    /** Suffix used when the image URL does not carry a usable file extension. */
    private static final String DEFAULT_IMAGE_SUFFIX = "png";

    private static final Pattern ANCHOR_OPEN =
            Pattern.compile("<a(\\s[^>]*)?>", Pattern.CASE_INSENSITIVE);
    private static final Pattern ANCHOR_CLOSE =
            Pattern.compile("</a>", Pattern.CASE_INSENSITIVE);
    private static final Pattern PAGE_ANCHOR =
            Pattern.compile("<a\\s[^>]*>[^<]*</a>", Pattern.CASE_INSENSITIVE);
    private static final Pattern IMG_TAG =
            Pattern.compile("<img\\s[^>]*>", Pattern.CASE_INSENSITIVE);
    /** Group 2 holds the value of a single- or double-quoted {@code href} attribute. */
    private static final Pattern HREF_ATTR =
            Pattern.compile("\\shref\\s*=\\s*([\"'])(.*?)\\1", Pattern.CASE_INSENSITIVE);
    /** Group 2 holds the value of a single- or double-quoted {@code src} attribute. */
    private static final Pattern SRC_ATTR =
            Pattern.compile("\\ssrc\\s*=\\s*([\"'])(.*?)\\1", Pattern.CASE_INSENSITIVE);

    private String url;

    /** Creates a getter that points at the default feed. */
    public RssGetter() {
        this.url = DEFAULT_FEED_URL;
    }

    /**
     * @return the URL of the feed that {@link #getRssSource()} will fetch
     */
    public String getUrl() {
        return url;
    }

    /**
     * @param url the URL of the feed that {@link #getRssSource()} should fetch
     */
    public void setUrl(String url) {
        this.url = url;
    }

    /**
     * Fetches the feed and processes every {@code <item>} it contains.
     *
     * <p>Failures are reported with {@code printStackTrace()} and never
     * propagate to the caller. A failure in one article does not prevent the
     * remaining articles from being processed.
     */
    public void getRssSource() {
        try {
            InputStream feed = new URL(this.url).openStream();
            try {
                NodeList items = parseFeed(feed).getElementsByTagName("item");
                for (int i = 0; i < items.getLength(); i++) {
                    try {
                        processItem(items.item(i));
                    } catch (Exception e) {
                        // One broken article must not abort the rest of the feed.
                        e.printStackTrace();
                    }
                }
            } catch (Exception e) {
                e.printStackTrace();
            } finally {
                feed.close();
            }
        } catch (IOException io) {
            io.printStackTrace();
        }
    }

    /**
     * Parses the feed into a DOM document.
     *
     * <p>The feed is untrusted remote input, so resolution of external
     * entities and external DTDs is switched off before parsing.
     *
     * @param in stream positioned at the start of the XML document
     * @return the parsed document
     * @throws Exception if the parser cannot be configured or the XML is invalid
     */
    private static Document parseFeed(InputStream in) throws Exception {
        javax.xml.parsers.DocumentBuilderFactory factory =
                javax.xml.parsers.DocumentBuilderFactory.newInstance();
        factory.setFeature("http://xml.org/sax/features/external-general-entities", false);
        factory.setFeature("http://xml.org/sax/features/external-parameter-entities", false);
        factory.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
        return factory.newDocumentBuilder().parse(in);
    }

    /**
     * Reads title, link and publication date from one {@code <item>} and
     * downloads the linked article and its additional pages.
     *
     * @param item an {@code <item>} element of the feed
     * @throws Exception if the article or one of its pages cannot be fetched
     */
    private void processItem(Node item) throws Exception {
        // Title and publication date are extracted but not consumed by this
        // class; only the link drives the download.
        String title = "";
        String link = "";
        Date published = new Date();

        NodeList children = item.getChildNodes();
        for (int j = 0; j < children.getLength(); j++) {
            Node child = children.item(j);
            String name = child.getNodeName().trim();
            if (name.equals("title")) {
                title = child.getTextContent().trim();
            } else if (name.equals("link")) {
                link = child.getTextContent().trim();
            } else if (name.equals("pubDate")) {
                published = parsePubDate(child.getTextContent());
            }
        }

        String html = fetch(link);
        // First page of the article.
        this.getLinkInfo(html, false);
        // Remaining pages, if the article is paginated.
        this.getPages(html);
    }

    /**
     * Parses a {@code pubDate} value of the form {@code yyyy-MM-ddTHH:mm:ss}
     * (a space instead of the 'T' is accepted as well).
     *
     * @param pubDate raw text of the {@code pubDate} element
     * @return the parsed date, or the current time when the text is empty or
     *         has a different layout
     */
    private static Date parsePubDate(String pubDate) {
        String text = pubDate.trim().replace('T', ' ');
        if (text.length() > 0) {
            try {
                return new SimpleDateFormat(PUB_DATE_FORMAT).parse(text);
            } catch (java.text.ParseException ignored) {
                // An unexpected date layout is not a reason to drop the article.
            }
        }
        return new Date();
    }

    /**
     * Downloads the resource at {@code address} and decodes it as UTF-8.
     *
     * @param address absolute URL of the page
     * @return the page text
     * @throws IOException if the URL is malformed or the download fails
     */
    private static String fetch(String address) throws IOException {
        InputStream in = new URL(address).openStream();
        try {
            return readAll(in);
        } finally {
            in.close();
        }
    }

    /**
     * Reads the stream to its end and decodes the collected bytes in one step.
     *
     * <p>Only the bytes actually returned by each {@code read} call are kept,
     * and decoding happens once at the end so that a multi-byte character
     * straddling two reads is not corrupted.
     *
     * @param in stream to drain; not closed by this method
     * @return the decoded text
     * @throws IOException if reading fails
     */
    private static String readAll(InputStream in) throws IOException {
        ByteArrayOutputStream collected = new ByteArrayOutputStream();
        byte[] chunk = new byte[1024];
        int count;
        while ((count = in.read(chunk)) != -1) {
            collected.write(chunk, 0, count);
        }
        return collected.toString(PAGE_ENCODING);
    }

    /**
     * Extracts the article body from one page, removes its hyperlinks,
     * localises its images and prints the result to standard output.
     *
     * @param path either the URL of the page or the page's HTML, see {@code isLink}
     * @param isLink {@code true} when {@code path} is a URL that still has to be
     *               downloaded, {@code false} when it already is the HTML text
     * @throws Exception if the page has to be downloaded and the download fails
     */
    private void getLinkInfo(String path, boolean isLink) throws Exception {
        String html = isLink ? fetch(path) : path;

        // Narrow the search to the <body> element when it can be located;
        // otherwise search the whole document.
        String body = html;
        int bodyStart = html.indexOf("<body");
        if (bodyStart != -1) {
            int tagEnd = html.indexOf(">", bodyStart);
            int bodyEnd = html.indexOf("</body>", bodyStart);
            if (tagEnd != -1 && bodyEnd > tagEnd) {
                body = html.substring(tagEnd + 1, bodyEnd);
            }
        }

        String article = this.parseContent(body);
        article = this.removeLinks(article);
        article = this.processImages(article);
        System.out.println(article);
    }

    /**
     * Follows the pagination links of an article and processes every further page.
     *
     * <p>The links are taken from the first {@code <div class="LP T H">}
     * element. The marker of the current page ("onPage") and the "next page"
     * link are skipped, so only the remaining links in that element are followed.
     *
     * @param input HTML of the first page of the article
     * @throws Exception if one of the pages cannot be fetched
     */
    private void getPages(String input) throws Exception {
        int divStart = input.indexOf("<div class=\"LP T H\">");
        if (divStart == -1) {
            return; // the article is not paginated
        }
        int contentStart = input.indexOf(">", divStart) + 1;
        int divEnd = input.indexOf("</div>", contentStart);
        if (divEnd == -1) {
            return;
        }

        Matcher anchors = PAGE_ANCHOR.matcher(input.substring(contentStart, divEnd));
        while (anchors.find()) {
            String anchor = anchors.group();
            // "[\u4e0b\u4e00\u9875]" is the literal label of the "next page" link.
            if (anchor.indexOf("onPage") != -1
                    || anchor.indexOf("[\u4e0b\u4e00\u9875]") != -1) {
                continue;
            }
            String href = attributeValue(HREF_ATTR, anchor);
            if (href.length() > 0) {
                this.getLinkInfo(href, true);
            }
        }
    }

    /**
     * Cuts the article body out of a page.
     *
     * @param contentText HTML to search
     * @return the text from {@code <div class="Con">} up to and including the
     *         first following {@code </div>}; the rest of the text when no
     *         closing tag follows; the empty string when the opening tag is absent
     */
    private String parseContent(String contentText) {
        int start = contentText.indexOf("<div class=\"Con\">");
        if (start == -1) {
            return "";
        }
        String content = contentText.substring(start);
        int end = content.indexOf("</div>");
        if (end != -1) {
            content = content.substring(0, end + "</div>".length());
        }
        return content;
    }

    /**
     * Removes the opening and closing {@code <a>} tags while keeping the text
     * between them.
     *
     * @param input HTML fragment
     * @return the fragment without anchor tags
     */
    private String removeLinks(String input) {
        String withoutOpening = ANCHOR_OPEN.matcher(input).replaceAll("");
        return ANCHOR_CLOSE.matcher(withoutOpening).replaceAll("");
    }

    /**
     * Copies every image referenced by an {@code <img>} tag to local disk and
     * replaces the image URL with the local path (using forward slashes).
     *
     * <p>When an image cannot be copied its original URL is left untouched.
     *
     * @param input HTML fragment
     * @return the fragment with the localised image paths
     */
    private String processImages(String input) {
        String output = input;
        List<String> handled = new ArrayList<String>();
        Matcher tags = IMG_TAG.matcher(input);
        while (tags.find()) {
            String src = attributeValue(SRC_ATTR, tags.group());
            // An image used several times only has to be copied once.
            if (src.length() == 0 || handled.contains(src)) {
                continue;
            }
            handled.add(src);

            String local = this.writeImageToServer(src);
            if (local.length() == 0) {
                continue;
            }
            // Literal replacement: a URL is full of regex metacharacters and a
            // Windows path is full of backslashes.
            output = output.replace(src, local.replace('\\', '/'));
        }
        return output;
    }

    /**
     * Returns the value of a quoted attribute inside a tag.
     *
     * @param attribute pattern whose second group captures the attribute value
     * @param tag the text of one tag
     * @return the attribute value, or the empty string when the attribute is
     *         absent or not properly quoted
     */
    private static String attributeValue(Pattern attribute, String tag) {
        Matcher m = attribute.matcher(tag);
        return m.find() ? m.group(2) : "";
    }

    /**
     * Downloads an image and writes it below the image directory.
     *
     * @param path URL of the original image
     * @return path of the local copy, or the empty string when the image could
     *         not be downloaded, decoded or written
     */
    private String writeImageToServer(String path) {
        try {
            BufferedImage image = ImageIO.read(new URL(path));
            if (image == null) {
                // No registered reader recognised the data.
                return "";
            }
            String suffix = imageSuffix(path);
            File target = newImageFile(suffix);
            if (!ImageIO.write(image, suffix, target)) {
                // No writer exists for this suffix; do not leave an empty file behind.
                target.delete();
                return "";
            }
            return target.getPath();
        } catch (IOException io) {
            io.printStackTrace();
            return "";
        }
    }

    /**
     * Derives the image format from the file extension of the URL, ignoring
     * any query string or fragment.
     *
     * @param path URL of the image
     * @return the extension without the dot, or "png" when the last path
     *         segment has no extension
     */
    private static String imageSuffix(String path) {
        int end = path.length();
        int query = path.indexOf('?');
        if (query != -1) {
            end = query;
        }
        int fragment = path.indexOf('#');
        if (fragment != -1 && fragment < end) {
            end = fragment;
        }
        String resource = path.substring(0, end);
        int dot = resource.lastIndexOf('.');
        if (dot > resource.lastIndexOf('/') && dot < resource.length() - 1) {
            return resource.substring(dot + 1);
        }
        return DEFAULT_IMAGE_SUFFIX;
    }

    /**
     * Picks a file name in the image directory that is not in use yet.
     *
     * <p>The name is the current time in milliseconds, incremented while a file
     * with that name already exists.
     *
     * @param suffix file extension without the dot
     * @return the file to write; its parent directory is created when missing
     */
    private static File newImageFile(String suffix) {
        long stamp = System.currentTimeMillis();
        File candidate = new File(IMAGE_DIR + stamp + "." + suffix);
        while (candidate.exists()) {
            stamp++;
            candidate = new File(IMAGE_DIR + stamp + "." + suffix);
        }
        File directory = candidate.getParentFile();
        if (directory != null) {
            directory.mkdirs();
        }
        return candidate;
    }
}
313
314

新用户注册刷新评论列表


只有注册用户登录后才能发表评论。




网站导航: 博客园 IT新闻 Chat2DB C++博客博问管理

~叠~

导航

常用链接

留言簿(1)

随笔档案

搜索

最新评论

RSS内容抓取