手上的项目需要到网络上去爬取wsdl文件,来充实服务网络。一开始的想法是自己采用网络爬虫的想法,可以自己编写一个只针对网页上的超链接进行遍历的爬虫;或者可以对现有的爬虫器进行改进,使其只关注网页的链接部分,至于网页的内容不去理会,这样可以避免爬虫的效率和数据流量过大的问题。后来发现Google的高级搜索可以对文件的类型进行约束,所以决定利用Google的搜索引擎来实现该目标。利用Google搜索引擎进行检索,能够有很高的查询效率和很高的查全率与查准率,同时还可以通过关键字进行限制查询。
利用Google进行检索主要可以通过以下两种方法实现:
1.使用Google API,可以直接设定查询的条件,并且得到结构简单的查询结果。但是Google API目前好像已经停止使用了,非常遗憾。
2.通过向Google发送页面请求消息来实现查询。这就涉及到两个问题:首先要对Google的查询请求的格式有一定了解,如何把查询的约束条件信息包含在请求消息中;其次,通过页面请求查询,得到的是html页面代码,所以要对页面中的内容进行解析、提取,过滤出有用的超链接结果。
这里主要介绍一下后一种方法,不过从理论上讲,前一种方法更加科学和简便。
一、Google的查询页面请求消息格式
http://www.google.com/search?num=50&hl=en&lr=&newwindow=1&as_qdr=all&q=a++filetype:wsdl&start=50&sa=N
其中 num为每页的结果个数;hl表示语言;q=后面为关键字;filetype为文件格式;start为开始的结果标号。
二、发送页面请求
/**
 * Fetches one page of Google search results as a raw HTML string.
 *
 * Fixes over the original: the input stream is now closed (and the
 * connection released) in a finally block, the page is decoded as UTF-8
 * to match the charset used when parsing it later, and the page body is
 * accumulated in a StringBuilder instead of O(n^2) String concatenation.
 *
 * @param keyString space-separated search keywords (may be null)
 * @param filetype  file extension to restrict results to (e.g. "wsdl")
 * @param start     index of the first result on the requested page
 * @return the HTML of the result page with line breaks removed
 * @throws IOException if the connection or read fails
 */
private String searchPage(String keyString, String filetype, int start) throws IOException
{
    URL url = buildurl(keyString, filetype, start);
    HttpURLConnection http = (HttpURLConnection) url.openConnection();
    // Google rejects requests without a browser-like User-Agent.
    http.setRequestProperty("User-Agent", "Mozilla/5.0");
    http.connect();
    StringBuilder htmlBuilder = new StringBuilder();
    BufferedReader bufReader = null;
    try {
        bufReader = new BufferedReader(
                new InputStreamReader(http.getInputStream(), "UTF-8"));
        String currentLine;
        while ((currentLine = bufReader.readLine()) != null)
            htmlBuilder.append(currentLine);
    } finally {
        if (bufReader != null)
            bufReader.close();
        http.disconnect();
    }
    return htmlBuilder.toString();
}
三、对返回页面内容进行解析
这里使用了HTML Parser进行解析,代码如下:
/**
 * Extracts from a Google result page all hyperlinks that point to a file
 * of the requested type, skipping Google's own "Similar pages" links.
 *
 * Fixes over the original: when parsing throws a ParserException the
 * method now returns the (empty) list instead of dereferencing the
 * still-null nodeList and throwing a NullPointerException, and
 * extractLink() is called once per node instead of twice.
 *
 * @param html     HTML of one Google result page
 * @param filetype file extension the links must end with (e.g. "wsdl")
 * @return the matching link URLs, possibly empty; never null
 */
private List<String> listURL(String html, String filetype)
{
    List<String> urlList = new ArrayList<String>();
    Parser parser = Parser.createParser(html, "UTF-8");
    NodeList nodeList = null;
    try {
        nodeList = parser.extractAllNodesThatMatch(
                new NodeFilter() {
                    private static final long serialVersionUID = 0L;
                    public boolean accept(Node node) {
                        return node instanceof LinkTag;
                    }
                });
    } catch (ParserException e) {
        e.printStackTrace();
        // Parse failure: return the empty list rather than NPE below.
        return urlList;
    }
    for (int i = 0; i < nodeList.size(); i++) {
        LinkTag link = (LinkTag) nodeList.elementAt(i);
        String target = link.extractLink();
        if (target.endsWith("." + filetype)
                && !link.getStringText().equalsIgnoreCase("Similar pages"))
            urlList.add(target);
    }
    return urlList;
}
四、附完整程序代码
package cn.edu.tju.cs.ikse.sn.spider;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
/**
 * Searches Google for files of a given type (e.g. {@code filetype:wsdl})
 * by sending plain HTTP page requests and scraping the result links with
 * HTML Parser.
 *
 * Not thread-safe; intended for single-threaded batch crawling.
 */
public class GoogleSearch {

    /** Results requested per page; must match num= in the query URL. */
    private static final int PAGE_SIZE = 100;

    /**
     * Splits a keyword string on single spaces, dropping empty tokens.
     *
     * @param keyString space-separated keywords, or null
     * @return the non-empty keywords, or null when keyString is null
     */
    private List<String> splitString(String keyString)
    {
        if (keyString == null)
            return null;
        String[] keyWords = keyString.split(" ");
        List<String> keyList = new ArrayList<String>(keyWords.length);
        for (int i = 0; i < keyWords.length; i++)
            if (keyWords[i].length() != 0)
                keyList.add(keyWords[i]);
        return keyList;
    }

    /**
     * Builds the Google query URL for the given keywords, file type and
     * result offset (see the request format described above the code).
     *
     * @param keyString space-separated keywords (may be null)
     * @param filetype  file extension for the filetype: operator
     * @param start     index of the first result to request
     * @return the query URL, or null if it is malformed
     */
    private URL buildurl(String keyString, String filetype, int start)
    {
        StringBuilder urlString = new StringBuilder(
                "http://www.google.com/search?num=" + PAGE_SIZE
                + "&hl=en&lr=&newwindow=1&as_qdr=all&q=");
        // Call splitString once instead of twice as the original did.
        List<String> keys = splitString(keyString);
        if (keys != null)
        {
            for (Iterator<String> keyIt = keys.iterator(); keyIt.hasNext();)
                urlString.append(keyIt.next()).append('+');
        }
        urlString.append("filetype:").append(filetype)
                 .append("&start=").append(start).append("&sa=N");
        URL url = null;
        try {
            url = new URL(urlString.toString());
        } catch (MalformedURLException e) {
            System.out.println("String to URL Errors!");
            e.printStackTrace();
        }
        return url;
    }

    /**
     * Fetches one page of Google search results as a raw HTML string.
     *
     * The stream is decoded as UTF-8 (matching the parser below) and is
     * always closed; the connection is released via disconnect().
     *
     * @param keyString space-separated keywords (may be null)
     * @param filetype  file extension to restrict results to
     * @param start     index of the first result on the requested page
     * @return the HTML of the result page with line breaks removed
     * @throws IOException if the connection or read fails
     */
    private String searchPage(String keyString, String filetype, int start) throws IOException
    {
        URL url = buildurl(keyString, filetype, start);
        HttpURLConnection http = (HttpURLConnection) url.openConnection();
        // Google rejects requests without a browser-like User-Agent.
        http.setRequestProperty("User-Agent", "Mozilla/5.0");
        http.connect();
        StringBuilder htmlBuilder = new StringBuilder();
        BufferedReader bufReader = null;
        try {
            bufReader = new BufferedReader(
                    new InputStreamReader(http.getInputStream(), "UTF-8"));
            String currentLine;
            while ((currentLine = bufReader.readLine()) != null)
                htmlBuilder.append(currentLine);
        } finally {
            if (bufReader != null)
                bufReader.close();
            http.disconnect();
        }
        return htmlBuilder.toString();
    }

    /**
     * Extracts from a result page all links ending in the requested file
     * type, skipping Google's "Similar pages" links.
     *
     * Returns an empty list (instead of throwing NullPointerException as
     * the original did) when the page cannot be parsed.
     *
     * @param html     HTML of one Google result page
     * @param filetype file extension the links must end with
     * @return the matching link URLs, possibly empty; never null
     */
    private List<String> listURL(String html, String filetype)
    {
        List<String> urlList = new ArrayList<String>();
        Parser parser = Parser.createParser(html, "UTF-8");
        NodeList nodeList = null;
        try {
            nodeList = parser.extractAllNodesThatMatch(
                    new NodeFilter() {
                        private static final long serialVersionUID = 0L;
                        public boolean accept(Node node) {
                            return node instanceof LinkTag;
                        }
                    });
        } catch (ParserException e) {
            e.printStackTrace();
            // Parse failure: return the empty list rather than NPE below.
            return urlList;
        }
        for (int i = 0; i < nodeList.size(); i++) {
            LinkTag link = (LinkTag) nodeList.elementAt(i);
            String target = link.extractLink();
            if (target.endsWith("." + filetype)
                    && !link.getStringText().equalsIgnoreCase("Similar pages"))
                urlList.add(target);
        }
        return urlList;
    }

    /**
     * Collects up to {@code num} result URLs, paging through Google
     * {@link #PAGE_SIZE} results at a time.
     *
     * On a network error the partial result collected so far is returned
     * (the original instead passed a null page to listURL and crashed).
     *
     * @param keyString space-separated keywords (may be null)
     * @param filetype  file extension to restrict results to
     * @param num       maximum number of URLs to return
     * @return at most num URLs; fewer when Google runs out of results or
     *         a network error occurs
     */
    public List<String> search(String keyString, String filetype, int num)
    {
        List<String> urlList = new ArrayList<String>();
        int start = 0;
        while (urlList.size() < num)
        {
            String html;
            try {
                html = searchPage(keyString, filetype, start);
                start += PAGE_SIZE;
            } catch (IOException e) {
                e.printStackTrace();
                // Network failure: return what we have instead of passing
                // a null page to listURL (NPE in the original).
                break;
            }
            List<String> urlListOfPage = listURL(html, filetype);
            if (urlListOfPage.isEmpty())
            {
                System.out.println("The maximum number of the results is " + urlList.size());
                return urlList;
            }
            urlList.addAll(urlListOfPage);
        }
        // Trim any overshoot from the last page down to exactly num.
        while (urlList.size() > num)
            urlList.remove(urlList.size() - 1);
        return urlList;
    }

    /**
     * Collects every available result URL (no upper bound beyond what
     * Google returns).
     *
     * @param keyString space-separated keywords (may be null)
     * @param filetype  file extension to restrict results to
     * @return all result URLs Google returns
     */
    public List<String> search(String keyString, String filetype)
    {
        // Integer.MAX_VALUE replaces the hand-rolled (int)(Math.pow(2,31)-1).
        return search(keyString, filetype, Integer.MAX_VALUE);
    }

    /** Demo entry point: fetch up to 1000 OWL file links for "book". */
    public static void main(String[] args) {
        GoogleSearch googleSearch = new GoogleSearch();
        List<String> re = googleSearch.search("book", "owl", 1000);
        System.out.println(re.size());
        for (int i = 0; i < re.size(); i++)
            System.out.println(re.get(i));
    }
}
posted on 2008-08-01 20:37
胖胖泡泡 阅读(163)
评论(0) 编辑 收藏