以前写过的一个从网上提取天气信息的类,参照了公司老前辈们的代码,可能不太规范,但基本功能已经实现。主要就是对页面源码的解析和有用信息的截取,取出来的都是有规律的字符串信息,可根据需要存进数据库,进行应用。代码如下:
1 package parsehtml;
2
3 import java.io.BufferedReader;
4 import java.io.InputStreamReader;
5 import java.net.HttpURLConnection;
6 import java.net.URL;
7 import java.util.ArrayList;
8 import java.util.Iterator;
9 import java.util.List;
10
/**
 * Downloads a province weather page and prints one comma-separated record
 * per city that can be extracted from the HTML.
 *
 * <p>Each printed record has the form {@code city,weather,high~low,} and is
 * assembled from three HTML lines: the line holding the {@code citydetail}
 * link, the line two below it, and the line after that. The parsing is
 * purely positional (fixed character offsets and neighbouring lines), so it
 * is tied to the exact markup of the page it was written against.
 */
public class ParseHtml extends Thread {

    /** Characters dropped from the front of the text found after {@code sta_id}. */
    private static final int CITY_PREFIX_LENGTH = 24;

    /** Characters dropped from the front of the text found after {@code alt}. */
    private static final int WEATHER_PREFIX_LENGTH = 1;

    /**
     * Characters dropped from the front of the text found after the first
     * {@code -}; presumably the length of a leading {@code <strong>} tag
     * (TODO confirm against the real page).
     */
    private static final int LOW_TEMP_PREFIX_LENGTH = 8;

    /**
     * Fetches and parses the Hebei weather page. Any failure is reported on
     * standard error/output and does not propagate to the caller.
     */
    public void run() {
        try {
            // Hebei province forecast page.
            String urlAddress = "http://www.weathercn.com/forecast/province.jsp?province=hebei";
            startParse(urlAddress);
        } catch (Exception e) {
            e.printStackTrace();
            System.out.println("网络错误,提取天气数据出错!");
        }
    }

    /**
     * Opens an HTTP connection to {@code urlAddress}, reads the response body
     * line by line and prints every weather record found in it.
     *
     * @param urlAddress absolute URL of the page to parse
     * @throws Exception if the URL is malformed, the connection fails, or the
     *                   response body cannot be read
     */
    public void startParse(String urlAddress) throws Exception {
        System.out.println("开始提取网址:" + urlAddress);
        URL url = new URL(urlAddress);
        HttpURLConnection httpConnection = (HttpURLConnection) url.openConnection();
        try {
            httpConnection.setRequestProperty("User-Agent", "Mozilla");
            httpConnection.setRequestProperty("Connection", "Keep-Alive");

            int responseCode = 0;
            try {
                responseCode = httpConnection.getResponseCode();
            } catch (Exception ex) {
                System.out.println("读取网页失败,返回代码:" + responseCode);
                // Do not carry on with a connection that never produced a status line.
                throw ex;
            }
            System.out.println("读取网页反回代码:" + responseCode);

            // NOTE(review): no charset is given, so the platform default is used to
            // decode the page; the page's real encoding is not visible here - confirm.
            BufferedReader reader = new BufferedReader(
                    new InputStreamReader(httpConnection.getInputStream()));
            try {
                isStartPoint(reader, "99", 1);
            } finally {
                // Closing the BufferedReader also closes the wrapped stream reader.
                reader.close();
            }
        } finally {
            httpConnection.disconnect();
        }
    }

    /**
     * Scans the page line by line and prints every complete weather record.
     *
     * <p>Extraction is enabled once {@code tag} has been seen {@code number}
     * times (case-insensitively, counted across lines). From then on, every
     * line containing both {@code citydetail} and {@code 99} starts a record:
     * that line supplies the city, the next line is skipped, the following
     * line supplies the weather and the one after it the temperatures. A
     * record is printed only when it splits into at least three
     * comma-separated fields.
     *
     * @param reader source of the HTML lines; not closed by this method
     * @param tag    marker whose occurrences are counted down before extraction starts
     * @param number how many occurrences of {@code tag} must be seen first;
     *               zero or less starts extraction immediately
     * @throws Exception if reading from {@code reader} fails
     */
    private void isStartPoint(BufferedReader reader, String tag, int number)
            throws Exception {
        String currentLine;
        while ((currentLine = reader.readLine()) != null) {
            number = countDown(currentLine, tag, number);

            // NOTE(review): the literal "99" duplicates the tag passed in by
            // startParse; whether it is meant to follow `tag` is not clear from here.
            boolean startsRecord = number <= 0
                    && currentLine.indexOf("citydetail") >= 0
                    && currentLine.indexOf("99") >= 0;
            if (!startsRecord) {
                continue;
            }

            StringBuffer record = new StringBuffer(processBuffer(currentLine));

            // The line directly after the city line carries nothing we need.
            reader.readLine();

            // Weather description.
            currentLine = reader.readLine();
            if (currentLine != null) {
                record.append(processBuffer(currentLine));
            }

            // High/low temperature.
            currentLine = reader.readLine();
            if (currentLine != null) {
                record.append(processBuffer(currentLine));
            }

            // Fields are comma-terminated so they can be split apart again later.
            String tq = record.toString();
            if (tq.split(",").length >= 3) {
                System.out.println(tq);
            }
        }
    }

    /**
     * Decrements {@code remaining} once for every case-insensitive occurrence
     * of {@code tag} in {@code line}, stopping at zero.
     */
    private static int countDown(String line, String tag, int remaining) {
        int from = 0;
        while (remaining > 0) {
            int hit = indexOfIgnoreCase(line, tag, from);
            if (hit == -1) {
                break;
            }
            from = hit + 1;
            remaining--;
        }
        return remaining;
    }

    /**
     * Extracts whichever record fields the given HTML line contains.
     *
     * <ul>
     *   <li>{@code citydetail}: text between {@code sta_id} and the next
     *       {@code <}, minus its first 24 characters (the city name);</li>
     *   <li>{@code alt}: text between {@code alt} and the next {@code >},
     *       minus its first character (the weather description);</li>
     *   <li>{@code strong}: with spaces removed, the text after the first
     *       {@code strong>} joined by {@code ~} to the text after the first
     *       {@code -} minus its first 8 characters (the temperatures).</li>
     * </ul>
     *
     * <p>A field whose raw text is shorter than the prefix to be dropped is
     * skipped instead of raising an exception, so a malformed line yields an
     * incomplete record that the caller discards.
     *
     * @param strLine one line of HTML
     * @return the extracted fields, each followed by a comma; empty if none
     * @throws Exception never thrown by the current implementation; kept for
     *                   signature compatibility
     */
    private String processBuffer(String strLine) throws Exception {
        StringBuffer fields = new StringBuffer();

        if (strLine.indexOf("citydetail") >= 0) {
            appendField(fields,
                    dropPrefix(subString(strLine, "sta_id", "<"), CITY_PREFIX_LENGTH));
        }
        if (strLine.indexOf("alt") >= 0) {
            appendField(fields,
                    dropPrefix(subString(strLine, "alt", ">"), WEATHER_PREFIX_LENGTH));
        }
        if (strLine.indexOf("strong") >= 0) {
            String compact = strLine.replaceAll(" ", "");
            String high = subString(compact, "strong>", "<");
            // NOTE(review): the first '-' is taken as the high/low separator; a
            // negative high temperature would presumably break this - confirm.
            String low = dropPrefix(subString(compact, "-", "</"), LOW_TEMP_PREFIX_LENGTH);
            if (low != null) {
                appendField(fields, high + "~" + low);
            }
        }
        return fields.toString();
    }

    /** Appends {@code value} and a trailing comma, or nothing when {@code value} is null. */
    private static void appendField(StringBuffer target, String value) {
        if (value != null) {
            target.append(value).append(",");
        }
    }

    /**
     * Returns {@code text} without its first {@code count} characters, or
     * {@code null} when {@code text} is too short to drop that many.
     */
    private static String dropPrefix(String text, int count) {
        return text.length() < count ? null : text.substring(count);
    }

    /**
     * Returns the trimmed text of {@code strSource} that lies between the
     * first occurrence of {@code strStart} and the next occurrence of
     * {@code strEnd} after it. Markers are matched case-insensitively; the
     * returned text keeps the original casing.
     *
     * @param strSource text to search
     * @param strStart  marker preceding the wanted text
     * @param strEnd    marker following the wanted text
     * @return the trimmed text between the markers; everything after
     *         {@code strStart} if {@code strEnd} does not follow it; a single
     *         space if {@code strStart} is absent
     */
    private String subString(String strSource, String strStart, String strEnd) {
        int intStart = indexOfIgnoreCase(strSource, strStart, 0);
        if (intStart == -1) {
            return " ";
        }
        // Look for the end marker only after the start marker, so the two can
        // never overlap and the substring bounds are always ordered.
        int contentStart = intStart + strStart.length();
        int intEnd = indexOfIgnoreCase(strSource, strEnd, contentStart);
        String strRetu = (intEnd != -1)
                ? strSource.substring(contentStart, intEnd)
                : strSource.substring(contentStart);
        return strRetu.trim();
    }

    /**
     * Case-insensitive {@code indexOf} that works on the original text, so
     * the returned index is valid for {@code text} itself.
     */
    private static int indexOfIgnoreCase(String text, String marker, int fromIndex) {
        int markerLength = marker.length();
        int last = text.length() - markerLength;
        for (int i = Math.max(fromIndex, 0); i <= last; i++) {
            if (text.regionMatches(true, i, marker, 0, markerLength)) {
                return i;
            }
        }
        return -1;
    }

    /** Creates a parser; no state is initialised. */
    public ParseHtml() {

    }

    /**
     * Command-line entry point: parses the Hebei weather page once.
     *
     * @param args ignored
     */
    public static void main(String args[]) {
        ParseHtml p = new ParseHtml();
        // run() is invoked directly, so the fetch executes on the calling
        // thread rather than on a newly started one.
        p.run();
    }
}
173