package com.rupeng.search.discuz;
import java.net.URLConnection;
import org.htmlparser.Parser;
import org.htmlparser.Tag;
import org.htmlparser.tags.Div;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.visitors.HtmlPage;
import org.htmlparser.visitors.NodeVisitor;
/**
 * Parses a Discuz forum thread page (default style) fetched through a
 * {@link URLConnection} and exposes the page title together with the
 * concatenated contents of every post found on the page.
 *
 * <p>All parsing happens in the constructor; afterwards the instance only
 * holds the two extracted strings and never changes.
 */
public class DiscuzDefaultStyleHTMLParser
{
    /**
     * The thread topic and every reply live in a div whose id starts with
     * this prefix.
     */
    private static final String POST_DIV_ID_PREFIX = "postmessage_";

    private final String title;
    private final String bodyText;

    /**
     * Reads and parses the page behind the given connection.
     *
     * @param urlConnection connection to the thread page to parse
     * @throws ParserException if the HTML parser fails to read or parse the page
     */
    public DiscuzDefaultStyleHTMLParser(URLConnection urlConnection) throws ParserException
    {
        Parser parser = new Parser(urlConnection);
        HtmlPage visitor = new HtmlPage(parser);
        parser.visitAllNodesWith(visitor);
        this.title = visitor.getTitle();

        // Only ever touched by this constructor's thread, so the
        // unsynchronized StringBuilder is sufficient.
        final StringBuilder sb = new StringBuilder();

        // Guard against a missing node list (e.g. a page whose body has no
        // children) instead of failing with a NullPointerException; such a
        // page simply yields an empty thread text.
        NodeList nodeList = visitor.getBody();
        if (nodeList != null)
        {
            nodeList.visitAllNodesWith(new NodeVisitor() {
                @Override
                public void visitTag(Tag tag)
                {
                    // The topic and the replies are all wrapped in divs, and
                    // the ids of those divs all start with "postmessage_".
                    if (tag instanceof Div)
                    {
                        Div div = (Div) tag;
                        String divId = div.getAttribute("id");
                        if (divId != null && divId.startsWith(POST_DIV_ID_PREFIX))
                        {
                            sb.append(div.getStringText());
                        }
                    }
                }
            });
        }
        this.bodyText = sb.toString();
    }

    /**
     * Returns the page title as reported by the HTML parser.
     *
     * @return the title extracted from the page
     */
    public String getTitle()
    {
        return title;
    }

    /**
     * Returns the contents of all post divs, concatenated in the order the
     * parser visited them, with no separator in between.
     *
     * @return the collected post contents; empty if no post div was found
     */
    public String getThreadText()
    {
        return bodyText;
    }
}