梦幻之旅

DEBUG - 天道酬勤

:: 首页 :: 新随笔 :: 联系 :: 聚合

:: 管理 ::

671 随笔 :: 6 文章 :: 256 评论 :: 0 Trackbacks

抓网页

今天晚上,帮一位同门师兄解决了一个问题。
题目是:抓取一个网站的所有页面,并抓下这些页面中的所有网址。
代码如下:

package com.hwp.test;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.Charset;
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Deque;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class SearchEngine

{

private Map<String, List<String>> pageNameUrls;

public SearchEngine()

{

pageNameUrls = new HashMap<String, List<String>>();

}

private String getContent(String httpUrl)

{

String htmlCode = "";

try

{

InputStream in;

URL url = new java.net.URL(httpUrl);

HttpURLConnection connection = (HttpURLConnection) url

.openConnection();

connection = (HttpURLConnection) url.openConnection();

connection.setRequestProperty("User-Agent", "Mozilla/4.0");

connection.connect();

in = connection.getInputStream();

byte[] buffer = new byte[512];

int length = -1;

while ((length = in.read(buffer, 0, 512)) != -1)

{

htmlCode += new String(buffer, 0, length);

}

catch (Exception e)

{}

if (htmlCode == null)

{

return "";

}

return htmlCode;

}

private List<String> getPageUrls(String page)

{

List<String> urls = new ArrayList<String>();

String content = this.getContent(page);

String reg = "http://([\\w-]+\\.)+[\\w-]+(/[\\w- ./?%&=]*)?";

Pattern pattern = Pattern.compile(reg);

Matcher matcher = pattern.matcher(content);

String url = "";

while (matcher.find())

{

url = matcher.group();

if (!urls.contains(url))

{

urls.add(url);

}

return urls;

}

public void test(String url, String baseUrl)

{

String content = this.getContent(url);

// System.out.println(content);

String reg = "(" + baseUrl

+ "(/[\\w-]+)*(/[\\w-]+\\.(htm|html|xhtml|jsp|asp|php)))";

Pattern pattern = Pattern.compile(reg);

Matcher matcher = pattern.matcher(content);

while (matcher.find())

{

String tempUrl = matcher.group();

if (!this.pageNameUrls.containsKey(tempUrl))

{

//System.out.println(tempUrl);

this.pageNameUrls.put(tempUrl, this.getPageUrls(tempUrl));

test(tempUrl, baseUrl);

}

public static void main(String[] args)

{

String url = "http://www.blogjava.net";

String baseUrl = "http://www.blogjava.net";

SearchEngine se = new SearchEngine();

se.test(url, baseUrl);

Map<String, List<String>> map= se.pageNameUrls;

Set<Map.Entry<String, List<String>>> set = map.entrySet();

for(Map.Entry<String, List<String>> entry: set)

{

System.out.println(entry.getKey());

System.out.println(entry.getValue());

}

posted on 2008-07-14 23:24 HUIKK 阅读(432) 评论(0) 编辑收藏所属分类: Regular Exp

新用户注册刷新评论列表


只有注册用户登录后才能发表评论。




网站导航: 博客园博客园最新博文博问管理
相关文章: java 正则抓网页正则表达式抓取网页面上所有图片

梦幻之旅

公告

常用链接

留言簿(21)

随笔分类(644)

随笔档案(669)

文章档案(6)

最新随笔

积分与排名

最新评论

阅读排行榜

评论排行榜