今天晚上,帮我一个同门师兄,解决一下问题.
题目是:抓取一个网站的所有页面,并抓下这些页面中的所有网址.
代码如下:
package com.hwp.test;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.Charset;
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Deque;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * A minimal single-site crawler.
 *
 * <p>Starting from one URL it follows every link that points to a page under a
 * given base URL (a path ending in htm, html, xhtml, jsp, asp or php) and
 * records, for each such page, the distinct {@code http://} URLs found in its
 * content.
 *
 * <p>Instances are not thread-safe.
 */
public class SearchEngine
{
    /** Connect and read timeout, so one stalled server cannot hang the whole crawl. */
    private static final int TIMEOUT_MILLIS = 10000;

    /** Charset used when the response does not declare a usable one. */
    private static final Charset DEFAULT_CHARSET = Charset.forName("UTF-8");

    /**
     * Matches an absolute http URL. The path character class deliberately has
     * no space in it: with a space the match would run past the end of the URL
     * into the following text and could swallow the next URL.
     */
    private static final Pattern URL_PATTERN =
            Pattern.compile("http://([\\w-]+\\.)+[\\w-]+(/[\\w./?%&=-]*)?");

    /**
     * Path part of a crawlable page, appended to the quoted base URL. The
     * trailing negative lookahead stops "htm" from matching the front of
     * "html" (and "asp" the front of "aspx"), which would yield truncated URLs.
     */
    private static final String PAGE_PATH_REGEX =
            "(/[\\w-]+)*(/[\\w-]+\\.(htm|html|xhtml|jsp|asp|php))(?!\\w)";

    /** Extracts the charset parameter from a Content-Type header value. */
    private static final Pattern CHARSET_PARAM =
            Pattern.compile("charset\\s*=\\s*\"?([^\\s;\"]+)", Pattern.CASE_INSENSITIVE);

    /** Crawled page URL mapped to the distinct URLs found in that page. */
    private final Map<String, List<String>> pageNameUrls;

    /** Creates a crawler with no recorded pages. */
    public SearchEngine()
    {
        pageNameUrls = new HashMap<String, List<String>>();
    }

    /**
     * Returns a read-only view of the crawl result.
     *
     * @return page URL mapped to the URLs found in that page
     */
    public Map<String, List<String>> getPageNameUrls()
    {
        return Collections.unmodifiableMap(pageNameUrls);
    }

    /**
     * Crawls the site: downloads {@code url}, then every page under
     * {@code baseUrl} reachable from it, recording the URLs each page contains.
     * Pages already recorded by an earlier call are not fetched again.
     *
     * @param url     page to start from; it is recorded only if some crawled
     *                page links to it
     * @param baseUrl site prefix, taken literally (for example
     *                {@code http://www.blogjava.net}, without a trailing slash)
     * @throws IllegalArgumentException if {@code baseUrl} is {@code null}
     */
    public void test(String url, String baseUrl)
    {
        if (baseUrl == null)
        {
            throw new IllegalArgumentException("baseUrl must not be null");
        }
        Pattern pagePattern = pagePatternFor(baseUrl);

        // An explicit work list instead of recursion: a large site would
        // otherwise overflow the call stack.
        Set<String> queued = new HashSet<String>();
        Deque<String> pending = new ArrayDeque<String>();
        enqueueNewPages(getContent(url), pagePattern, queued, pending);

        while (!pending.isEmpty())
        {
            String page = pending.removeFirst();
            // One download serves both the URL listing and link discovery.
            String content = getContent(page);
            pageNameUrls.put(page, extractUrls(content));
            enqueueNewPages(content, pagePattern, queued, pending);
        }
    }

    /**
     * Returns the distinct {@code http://} URLs in {@code content}, in order of
     * first appearance.
     *
     * @param content text to scan; {@code null} is treated as empty
     * @return the URLs found, possibly empty, never {@code null}
     */
    static List<String> extractUrls(String content)
    {
        return findDistinct(URL_PATTERN, content);
    }

    /**
     * Returns the distinct links in {@code content} that point to a page under
     * {@code baseUrl}, in order of first appearance.
     *
     * @param content text to scan; {@code null} is treated as empty
     * @param baseUrl site prefix, matched literally
     * @return the page links found, possibly empty, never {@code null}
     */
    static List<String> extractPageLinks(String content, String baseUrl)
    {
        return findDistinct(pagePatternFor(baseUrl), content);
    }

    /**
     * Picks the charset named in a Content-Type header value.
     *
     * @param contentType header value such as {@code text/html; charset=GBK};
     *                    may be {@code null}
     * @return the declared charset, or UTF-8 when none is declared or the
     *         declared one is unknown to this JVM
     */
    static Charset charsetFrom(String contentType)
    {
        if (contentType != null)
        {
            Matcher matcher = CHARSET_PARAM.matcher(contentType);
            if (matcher.find())
            {
                try
                {
                    return Charset.forName(matcher.group(1));
                }
                catch (IllegalArgumentException ignored)
                {
                    // Illegal or unsupported charset name: fall back to the default.
                }
            }
        }
        return DEFAULT_CHARSET;
    }

    /** Builds the page-link pattern; the base URL is quoted so its dots match only dots. */
    private static Pattern pagePatternFor(String baseUrl)
    {
        return Pattern.compile(Pattern.quote(baseUrl) + PAGE_PATH_REGEX);
    }

    /** Collects every distinct match of {@code pattern} in {@code content}, in first-seen order. */
    private static List<String> findDistinct(Pattern pattern, String content)
    {
        Set<String> found = new LinkedHashSet<String>();
        if (content != null)
        {
            Matcher matcher = pattern.matcher(content);
            while (matcher.find())
            {
                found.add(matcher.group());
            }
        }
        return new ArrayList<String>(found);
    }

    /** Queues the page links in {@code content} that are neither recorded nor already queued. */
    private void enqueueNewPages(String content, Pattern pagePattern,
            Set<String> queued, Deque<String> pending)
    {
        for (String link : findDistinct(pagePattern, content))
        {
            if (!pageNameUrls.containsKey(link) && queued.add(link))
            {
                pending.addLast(link);
            }
        }
    }

    /**
     * Downloads a page as text. This is best-effort: any failure is reported on
     * standard error and yields an empty string, so one bad link does not abort
     * the crawl.
     *
     * @param httpUrl address to fetch
     * @return the decoded response body, or {@code ""} on failure
     */
    private String getContent(String httpUrl)
    {
        try
        {
            URLConnection raw = new URL(httpUrl).openConnection();
            if (!(raw instanceof HttpURLConnection))
            {
                System.err.println("SearchEngine: skipping non-HTTP URL " + httpUrl);
                return "";
            }
            HttpURLConnection connection = (HttpURLConnection) raw;
            connection.setConnectTimeout(TIMEOUT_MILLIS);
            connection.setReadTimeout(TIMEOUT_MILLIS);
            connection.setRequestProperty("User-Agent", "Mozilla/4.0");

            InputStream in = connection.getInputStream();
            try
            {
                // Decode once, after all bytes are read; decoding per chunk
                // breaks multi-byte characters that straddle a chunk boundary.
                byte[] body = readFully(in);
                return new String(body, charsetFrom(connection.getContentType()));
            }
            finally
            {
                in.close();
            }
        }
        catch (IOException e)
        {
            reportFailure(httpUrl, e);
        }
        catch (RuntimeException e)
        {
            // For example IllegalArgumentException for a URL without a host.
            reportFailure(httpUrl, e);
        }
        return "";
    }

    /** Reads the stream to its end and returns all bytes. */
    private static byte[] readFully(InputStream in) throws IOException
    {
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        byte[] buffer = new byte[4096];
        int length;
        while ((length = in.read(buffer)) != -1)
        {
            out.write(buffer, 0, length);
        }
        return out.toByteArray();
    }

    /** Reports a failed download on standard error. */
    private static void reportFailure(String httpUrl, Exception cause)
    {
        System.err.println("SearchEngine: failed to fetch " + httpUrl + ": " + cause);
    }

    /** Crawls www.blogjava.net and prints every page followed by the URLs it contains. */
    public static void main(String[] args)
    {
        String url = "http://www.blogjava.net";
        String baseUrl = "http://www.blogjava.net";
        SearchEngine se = new SearchEngine();
        se.test(url, baseUrl);
        for (Map.Entry<String, List<String>> entry : se.pageNameUrls.entrySet())
        {
            System.out.println(entry.getKey());
            System.out.println(entry.getValue());
        }
    }
}