HTML Parser in brief: this is one of the more active projects on SourceForge.net. The latest release is 1.6, which is also the version I use on my own NBA site. It is a fast, real-time parser for analyzing existing HTML, and in practice you will often be pleasantly surprised by how much thoughtful handling HTML Parser does for you. Its main uses are the following (a short warm-up sketch follows this list):
- Text extraction
- Link extraction, e.g. automatically wrapping a page's link text in anchor tags
- Resource extraction, e.g. handling image and audio resources
- Link checking, verifying that the links in an HTML page are valid
- Monitoring page content
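To give a feel for the API before the full class, here is a minimal warm-up sketch. It is my own illustration, not from the project docs: it assumes the htmlparser 1.6 jar is on the classpath, and the URL is just a placeholder. It dumps every link on a page using a NodeClassFilter:

import org.htmlparser.Parser;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;

public class LinkDump {
    public static void main(String[] args) throws Exception {
        // Parse the page and keep only <a> tags.
        Parser parser = new Parser("http://htmlparser.sourceforge.net/");
        parser.setEncoding("UTF-8");
        NodeList links = parser.parse(new NodeClassFilter(LinkTag.class));
        for (int i = 0; i < links.size(); i++) {
            LinkTag link = (LinkTag) links.elementAt(i);
            System.out.println(link.getLink() + " -> " + link.getLinkText());
        }
    }
}

Everything below builds on the same Parser / NodeFilter / NodeList trio.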
Heh, enough chatter :) on to the code.
import java.io.IOException;
import java.net.*;
import java.text.*;
import java.util.*;
import java.util.regex.*;

import org.apache.log4j.Logger;
import org.htmlparser.*;
import org.htmlparser.nodes.TextNode;
import org.htmlparser.tags.*;
import org.htmlparser.util.*;

// Constent, HttpParserUtil, MetaModel and HttpHeads are project-local helpers
// (an encoding constant, string utilities, and two simple beans).
public class BaseAction {

    public static final Logger logger = Logger.getLogger(BaseAction.class);

    public String keyWords = "姚明|姚明NBA";

    /**
     * Parses a URL or local file and returns all nodes accepted by the filter.
     */
    public static NodeList getAllNodeList(String urlOrfile, NodeFilter filter) {
        if (logger.isDebugEnabled())
            logger.debug("BaseAction getAllNodeList(" + urlOrfile + ")");
        try {
            Parser parser = new Parser(urlOrfile);
            parser.setEncoding(Constent.Encode);
            return parser.parse(filter);
        } catch (ParserException e) {
            logger.error("failed to parse " + urlOrfile, e);
            return null;
        }
    }
    /**
     * Walks LinkTag and TextNode entries and returns the href links
     * (plus any non-empty text) in document order.
     *
     * @param file   URL or local file to parse
     * @param filter filter selecting the nodes of interest
     * @return list of links and text lines, or null on parse failure
     */
    public List<String> parseLink(String file, NodeFilter filter) {
        if (logger.isDebugEnabled())
            logger.debug("BaseAction parseLink(" + file + ")");
        List<String> hrefList = new ArrayList<String>();
        try {
            NodeList nodelist = getAllNodeList(file, filter);
            if (nodelist == null)
                return null;
            Node[] nodes = nodelist.toNodeArray();
            String line = "";
            for (int i = 0; i < nodes.length; i++) {
                Node node = nodes[i];
                if (node instanceof TextNode) {
                    TextNode textnode = (TextNode) node;
                    line = textnode.getText();
                    logger.debug("textnode=" + line);
                } else if (node instanceof LinkTag) {
                    LinkTag link = (LinkTag) node;
                    line = link.getLink();
                    logger.debug("link=" + line);
                }
                if (HttpParserUtil.isTrimEmpty(line))
                    continue;
                hrefList.add(line);
            }
        } catch (Exception e) {
            logger.error("parseLink failed for " + file, e);
        }
        return hrefList;
    }
    /**
     * Collects links and text nodes, then pairs each link matching the
     * given patterns with the entry that follows it.
     *
     * @param file   URL or local file to parse
     * @param filter filter selecting the nodes of interest
     * @param pHtml  pattern matching .html-style links
     * @param pPhp   pattern matching .php-style links
     * @return map of link to following text, or null on parse failure
     */
    public Map<String, String> parseLinkWithText(String file,
            NodeFilter filter, Pattern pHtml, Pattern pPhp) {
        if (logger.isDebugEnabled())
            logger.debug("BaseAction parseLinkWithText(" + file + ")");
        Map<String, String> map = new HashMap<String, String>();
        List<String> list = new ArrayList<String>();
        try {
            NodeList nodelist = getAllNodeList(file, filter);
            if (nodelist == null)
                return null;
            Node[] nodes = nodelist.toNodeArray();
            String line = "";
            for (int i = 0; i < nodes.length; i++) {
                Node node = nodes[i];
                if (node instanceof TextNode) {
                    TextNode textnode = (TextNode) node;
                    line = textnode.getText();
                    if (HttpParserUtil.isTrimEmpty(line))
                        continue;
                    if (logger.isDebugEnabled())
                        logger.debug("textnode=" + line);
                    list.add(line);
                } else if (node instanceof LinkTag) {
                    LinkTag link = (LinkTag) node;
                    line = link.getLink();
                    if (HttpParserUtil.isTrimEmpty(line))
                        continue;
                    if (logger.isDebugEnabled())
                        logger.debug("link=" + line);
                    list.add(line);
                }
            }
            // Pair each matching link with the entry that follows it,
            // provided the follower is not itself a link of the same kind.
            int endPosition = list.size();
            for (int i = 0; i < endPosition; i++) {
                String curr = list.get(i);
                Matcher mHtml = pHtml.matcher(curr);
                Matcher mPhp = pPhp.matcher(curr);
                if ((mHtml.matches() || mPhp.matches()) && i < endPosition - 1) {
                    String next = list.get(i + 1);
                    Matcher mHtmlNext = pHtml.matcher(next);
                    Matcher mPhpNext = pPhp.matcher(next);
                    if ((mHtml.matches() && !mHtmlNext.matches())
                            || (mPhp.matches() && !mPhpNext.matches())) {
                        map.put(curr, next);
                        i++; // skip the text entry we just consumed
                    }
                }
            }
        } catch (Exception e) {
            logger.error("parseLinkWithText failed for " + file, e);
        }
        return map;
    }
    /**
     * Extracts content: renders the child nodes of the list to HTML.
     *
     * @param list the parsed node list
     * @return the concatenated HTML of the child nodes
     */
    public String parserContent(NodeList list) {
        return parserContent(list, false);
    }

    public String parserContent(NodeList list, boolean isCreateFile) {
        return parserContent(list, isCreateFile, list.size() + 1);
    }

    public String parserContent(NodeList list, int listIndex) {
        return parserContent(list, false, listIndex);
    }
    public String parserContent(NodeList list, boolean isCreateFile, int listIndex) {
        if (logger.isDebugEnabled())
            logger.debug("BaseAction parserContent()");
        StringBuffer content = new StringBuffer();
        if (list.size() < listIndex) {
            // Index out of range: render the children of every node instead.
            for (int i = 0; i < list.size(); i++) {
                Node node = list.elementAt(i);
                NodeList sublist = node.getChildren();
                if (sublist == null)
                    continue;
                Node[] listNode = sublist.toNodeArray();
                for (Node inNode : listNode) {
                    if (HttpParserUtil.isTrimEmpty(inNode.getText()))
                        continue;
                    logger.debug(inNode.toHtml());
                    content.append(inNode.toHtml());
                    if (isCreateFile)
                        content.append("\n");
                }
            }
        } else {
            // Render only the children of the node at listIndex.
            Node node = list.elementAt(listIndex);
            if (node == null) {
                logger.warn("the listIndex may be wrong, please check it");
                return null;
            }
            NodeList sublist = node.getChildren();
            if (sublist == null) {
                logger.warn("the listIndex may be wrong, please check it");
                return null;
            }
            Node[] listNode = sublist.toNodeArray();
            for (Node inNode : listNode) {
                if (HttpParserUtil.isTrimEmpty(inNode.getText()))
                    continue;
                logger.debug(inNode.toHtml());
                content.append(inNode.toHtml());
                if (isCreateFile)
                    content.append("\n");
            }
        }
        if (content.length() == 0) {
            logger.warn("no text was extracted");
        }
        return content.toString();
    }
    /**
     * Extracts the title, keywords and description from the meta tags.
     *
     * @param list the parsed node list
     * @return a MetaModel holding title, keywords and description
     */
    public MetaModel getMetaInfo(NodeList list) {
        MetaModel metaModel = new MetaModel();
        for (int index = 0; index < list.size(); index++) {
            Node firstNode = list.elementAt(index);
            if (!(firstNode instanceof Html))
                continue;
            NodeList htmlList = firstNode.getChildren();
            for (int i = 0; i < htmlList.size(); i++) {
                Node htmlNode = htmlList.elementAt(i);
                if (!(htmlNode instanceof HeadTag))
                    continue;
                NodeList headList = htmlNode.getChildren();
                for (int j = 0; j < headList.size(); j++) {
                    Node headNode = headList.elementAt(j);
                    if (headNode instanceof TitleTag) {
                        TitleTag titleTag = (TitleTag) headNode;
                        metaModel.setTitle(titleTag.getTitle());
                    }
                    if (!(headNode instanceof MetaTag))
                        continue;
                    MetaTag it = (MetaTag) headNode;
                    if (it.getMetaTagName() == null)
                        continue;
                    String name = it.getMetaTagName().toLowerCase();
                    if ("keywords".equals(name)) {
                        metaModel.setKeywords(it.getMetaContent().replaceAll("Hoopchina", keyWords));
                    } else if ("description".equals(name)) {
                        metaModel.setDescription(it.getMetaContent()
                                .replaceAll("Hoopchina", keyWords)
                                .replaceAll("虎扑体育论坛", keyWords));
                    }
                } // end headList
            } // end htmlList
            break; // only the first <html> node matters
        }
        return metaModel;
    }
    /**
     * Compares two HTTP date strings.
     *
     * @param currDate the date just fetched
     * @param lastDate the latest date stored in the database
     * @return true if currDate is after lastDate
     */
    public static boolean isDateAfter(String currDate, String lastDate) {
        try {
            DateFormat df = new SimpleDateFormat("E, dd MMM yyyy HH:mm:ss",
                    Locale.US);
            return df.parse(currDate).after(df.parse(lastDate));
        } catch (ParseException e) {
            logger.error("failed to parse dates", e);
            return false;
        }
    }
    /**
     * Reads the Date header from a URL. A typical set of response headers:
     *
     * X-Powered-By=[mod_xlayout_jh/0.0.1vhs.markII.remix]
     * ETag=["16be25-1cc81-150d9280"] null=[HTTP/1.0 200 OK] Date=[Mon, 17 Dec
     * 2007 07:10:23 GMT] Content-Type=[text/html] Cache-Control=[max-age=60]
     * Connection=[close] Expires=[Mon, 17 Dec 2007 07:11:23 GMT]
     * Accept-Ranges=[bytes] X-Cache=[HIT from sh-14.sina.com.cn]
     * Server=[Apache/2.0.59 (Unix)] Last-Modified=[Mon, 17 Dec 2007 07:08:42
     * GMT] Vary=[Accept-Encoding]
     */
    public String getUrlDate(String urlAddr) {
        String date = null;
        try {
            if (logger.isInfoEnabled())
                logger.info("now open the " + urlAddr);
            URL url = new URL(urlAddr);
            URLConnection conn = url.openConnection();
            // The timeout must be set before connecting to have any effect.
            conn.setConnectTimeout(10 * 1000);
            conn.connect();
            Map<String, List<String>> map = conn.getHeaderFields();
            Set<String> set = map.keySet();
            for (String key : set) {
                if (logger.isDebugEnabled())
                    logger.debug(key + "=" + map.get(key));
            }
            if (conn.getHeaderFields().toString().indexOf("200 OK") == -1) {
                logger.warn(urlAddr + " can't connect!");
                return null;
            }
            date = conn.getHeaderField("Date");
            // e.g. Date=[Mon, 17 Dec 2007 07:10:23 GMT]
        } catch (IOException e) {
            logger.error("failed to open " + urlAddr, e);
        }
        return date;
    }
    /**
     * Collects the common HTTP response headers (see the example above)
     * into an HttpHeads bean.
     */
    public HttpHeads getHttpHeads(String urlAddr) {
        try {
            if (logger.isInfoEnabled())
                logger.info("now open the " + urlAddr);
            URL url = new URL(urlAddr);
            URLConnection conn = url.openConnection();
            // The timeout must be set before connecting to have any effect.
            conn.setConnectTimeout(10 * 1000);
            conn.connect();
            Map<String, List<String>> map = conn.getHeaderFields();
            Set<String> set = map.keySet();
            for (String key : set) {
                if (logger.isDebugEnabled())
                    logger.debug(key + "=" + map.get(key));
            }
            logger.debug("contentLength()=" + conn.getContentLength());
            if (conn.getHeaderFields().toString().indexOf("200 OK") == -1) {
                logger.warn(urlAddr + " can't connect!");
                return null;
            }
            HttpHeads httpHeads = new HttpHeads();
            httpHeads.setDate(conn.getHeaderField("Date"));
            httpHeads.setAccept_Ranges(conn.getHeaderField("Accept-Ranges"));
            httpHeads.setCache_Control(conn.getHeaderField("Cache-Control"));
            httpHeads.setConnection(conn.getHeaderField("Connection"));
            httpHeads.setContent_Type(conn.getHeaderField("Content-Type"));
            httpHeads.setETag(conn.getHeaderField("ETag"));
            httpHeads.setExpires(conn.getHeaderField("Expires"));
            httpHeads.setLast_Modified(conn.getHeaderField("Last-Modified"));
            httpHeads.setServer(conn.getHeaderField("Server"));
            httpHeads.setVary(conn.getHeaderField("Vary"));
            httpHeads.setX_Cache(conn.getHeaderField("X-Cache"));
            httpHeads.setX_Powered_By(conn.getHeaderField("X-Powered-By"));
            httpHeads.setContentLength(conn.getContentLength());
            return httpHeads;
        } catch (IOException e) {
            logger.error("failed to open " + urlAddr, e);
        }
        return null;
    }
    public BaseAction() {
        super();
    }
}
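To round things off, here is a hypothetical driver for the class above. The URL and the OrFilter combination are my own assumptions for illustration; parseLink simply needs a filter that lets both LinkTag and TextNode nodes through:

import java.util.List;
import org.htmlparser.NodeFilter;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.OrFilter;
import org.htmlparser.nodes.TextNode;
import org.htmlparser.tags.LinkTag;

public class BaseActionDemo {
    public static void main(String[] args) {
        BaseAction action = new BaseAction();
        // Match both links and text nodes, as parseLink expects.
        NodeFilter filter = new OrFilter(
                new NodeClassFilter(LinkTag.class),
                new NodeClassFilter(TextNode.class));
        List<String> hrefs = action.parseLink("http://sports.sina.com.cn/", filter);
        if (hrefs != null) {
            for (String href : hrefs) {
                System.out.println(href);
            }
        }
    }
}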
I hope all of this is clear to everyone?
References
- HTML Parser project homepage: http://htmlparser.sourceforge.net/
- Download: http://sourceforge.net/projects/htmlparser