import
org.htmlparser.Node;
import
org.htmlparser.NodeFilter;
import
org.htmlparser.Parser;
import
org.htmlparser.filters.TagNameFilter;
import
org.htmlparser.tags.TableTag;
import
org.htmlparser.util.NodeList;
/**
* <br>
* 标题: <br>
* 功能概要: <br>
* 版权: cityyouth.cn (c) 2005 <br>
* 公司:上海城市青年网 <br>
* 创建时间:2005-12-21 <br>
* 修改时间: <br>
* 修改原因:
*
*
@author
张伟
*
@version
1.0
*/
public
class
TestYahoo {
public
static
void
testHtml() {
try
{
String sCurrentLine;
String sTotalString;
sCurrentLine
=
""
;
sTotalString
=
""
;
java.io.InputStream l_urlStream;
java.net.URL l_url
=
new
java.net.URL(
"
http://sports.sina.com.cn/iframe/nba/live/
"
);
java.net.HttpURLConnection l_connection
=
(java.net.HttpURLConnection) l_url
.openConnection();
l_connection.connect();
l_urlStream
=
l_connection.getInputStream();
java.io.BufferedReader l_reader
=
new
java.io.BufferedReader(
new
java.io.InputStreamReader(l_urlStream));
while
((sCurrentLine
=
l_reader.readLine())
!=
null
) {
sTotalString
+=
sCurrentLine;
}
System.out.println(sTotalString);
System.out.println(
"
====================
"
);
String testText
=
extractText(sTotalString);
System.out.println(testText);
}
catch
(Exception e) {
e.printStackTrace();
}
}
/**
* 抽取纯文本信息
*
*
@param
inputHtml
*
@return
*/
public
static
String extractText(String inputHtml)
throws
Exception {
StringBuffer text
=
new
StringBuffer();
Parser parser
=
Parser.createParser(
new
String(inputHtml.getBytes(),
"
8859_1
"
),
"
8859-1
"
);
//
遍历所有的节点
NodeList nodes
=
parser.extractAllNodesThatMatch(
new
NodeFilter() {
public
boolean
accept(Node node) {
return
true
;
}
});
Node node
=
nodes.elementAt(
0
);
text.append(
new
String(node.toPlainTextString().getBytes(
"
8859_1
"
)));
return
text.toString();
}
/**
* 读取文件的方式来分析内容. filePath也可以是一个Url.
*
*
@param
resource
* 文件/Url
*/
public
static
void
test5(String resource)
throws
Exception {
Parser myParser
=
new
Parser(resource);
//
设置编码
myParser.setEncoding(
"
GBK
"
);
String filterStr
=
"
table
"
;
NodeFilter filter
=
new
TagNameFilter(filterStr);
NodeList nodeList
=
myParser.extractAllNodesThatMatch(filter);
TableTag tabletag
=
(TableTag) nodeList.elementAt(
11
);
System.out.println(tabletag.toHtml());
System.out.println(
"
==============
"
);
}
/*
* public static void main(String[] args) { TestYahoo testYahoo = new
* TestYahoo(); testYahoo.testHtml(); }
*/
public
static
void
main(String[] args)
throws
Exception {
test5(
"
http://sports.yahoo.com/nba/scoreboard
"
);
}
}
// Originally posted on 2006-09-15 10:04 by 阿成 (views: 3951, comments: 0); category: Open source