Posted on 2007-09-10 12:27
ZelluX 阅读(416)
评论(2) 编辑 收藏 所属分类:
Scripting
connector.py
import urllib, urllib2, cookielib
class MyConnector:
    """Cookie-aware HTTP helper built on urllib2.

    ``login`` installs a process-wide urllib2 opener that carries a cookie
    jar, so later ``getHTML`` calls (which go through ``urllib2.urlopen``)
    send back whatever cookies the login response set.
    """

    def __init__(self):
        # Most recent response object; None until login()/getHTML() has run.
        self.sock = None

    def login(self, url, user_id='guest', passwd=''):
        """POST the login form to *url* and keep the resulting cookies.

        Args:
            url: address of the login endpoint.
            user_id: value sent as the ``id`` form field (default 'guest').
            passwd: value sent as the ``passwd`` form field (default '').

        Side effect: replaces urllib2's globally installed opener.
        """
        cookie_jar = cookielib.CookieJar()
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie_jar))
        urllib2.install_opener(opener)
        form_data = urllib.urlencode({'id': user_id, 'passwd': passwd})
        # Passing a data argument makes urllib2 send a POST instead of a GET.
        self.sock = urllib2.urlopen(url, form_data)

    def getHTML(self, url):
        """Fetch *url* and return the raw response body.

        The response is closed once its body has been read, so repeated
        calls do not accumulate open connections.
        """
        self.sock = urllib2.urlopen(url)
        try:
            return self.sock.read()
        finally:
            self.sock.close()
yanxiparser.py
from sgmllib import SGMLParser
import re
class YanxiURLParser(SGMLParser):
def reset(self):
self.result = []
SGMLParser.reset(self)
def start_a(self, attrs):
for (k, v) in attrs:
if (k == 'href' and (v.find('bbsanc') >= 0)):
self.result.append(v)
class YanxiHTMLParser:
    """Pull profile fields out of a fetched page using fixed marker phrases.

    The markers are written as raw byte escapes and are matched against the
    page text exactly as fetched, without decoding.
    NOTE(review): the byte sequences look like GB2312/GBK-encoded Chinese
    phrases -- confirm against the encoding the site actually serves.
    """

    # Text between the two marker phrases is the user id.
    _ID_RE = re.compile('\xbe\xcd\xca\xc7(.*)\xc0\xb2')
    # The value ends at byte \xa3 followed by either \xac or \xa1.  The
    # second alternative used to be the literal text "xa1" (missing
    # backslash), so that terminator could never match.
    _FROM_RE = re.compile('\xc0\xb4\xd7\xd4(.*)\xa3(\xac|\xa1)')
    # Value runs from the marker to the end of the line.
    _FAV_RE = re.compile('\xcf\xb2\xbb\xb6(.*)\n')
    # Value is the start of the line that ends with the marker phrase.
    _BIRTH_RE = re.compile('\n(.*)\xca\xc7\xce\xd2\xb5\xc4\xc9\xfa\xc8\xd5')

    def _capture(self, pattern, text):
        """Return group 1 of the first match, stripped, or '' if no match."""
        found = pattern.search(text)
        if found is None:
            return ''
        return found.group(1).strip()

    def parse(self, html):
        """Extract the id, from, birth and fav fields from *html*.

        Args:
            html: page text as returned by the connector.

        Returns:
            A dict with the keys "id", "from", "birth" and "fav".  A field
            whose marker phrase is absent from the page is returned as ''
            rather than raising.
        """
        # Normalise the markup the patterns must see through: the
        # non-breaking-space entity becomes a plain space and explicit line
        # breaks are dropped.
        html = html.replace('&nbsp;', ' ')
        html = html.replace('<br />', '')
        return {
            "id": self._capture(self._ID_RE, html),
            "from": self._capture(self._FROM_RE, html),
            "birth": self._capture(self._BIRTH_RE, html),
            "fav": self._capture(self._FAV_RE, html),
        }
runner.py
from connector import MyConnector
from yanxiparser import *
# Site root; link paths scraped from the listing pages are appended to it.
rootURL = 'http://yanxibbs.cn'
# Endpoint that receives the login form POST.
loginURL = 'http://yanxibbs.cn/bbslogin.php'
# The two listing pages to crawl (the path query value is URL-encoded).
url1 = 'http://yanxibbs.cn/cgi-bin/bbs/bbs0an?path=%2Fgroups%2FGROUP%5F3%2F06SS%2Fbyxx%2Fbjcy'
url2 = 'http://yanxibbs.cn/cgi-bin/bbs/bbs0an?path=%2Fgroups%2FGROUP%5F3%2F06SS%2Fbyxx%2Fbjyr'
# Shared connector used by printInfo(); the login runs as soon as this
# module executes, before any page is fetched.
conn = MyConnector()
conn.login(loginURL)
def printInfo(url):
    """Print one tab-separated profile line per entry linked from *url*.

    Fetches the listing page at *url*, collects the links selected by
    YanxiURLParser, then fetches each linked page (its path appended to
    rootURL), parses it with YanxiHTMLParser and prints the id, from,
    birth and fav fields.
    """
    link_collector = YanxiURLParser()
    link_collector.feed(conn.getHTML(url))
    profile_parser = YanxiHTMLParser()
    for link in link_collector.result:
        page = conn.getHTML(rootURL + link)
        fields = profile_parser.parse(page)
        # Single parenthesised argument: prints the same text whether
        # print is a statement or a function.
        print("%(id)s\t%(from)s\t%(birth)s\t%(fav)s" % fields)
# Print the profile table for each of the two listing pages in turn.
printInfo(url1)
printInfo(url2)