#关键字
python, pycurl, SGMLParser, sqlite3, 抓取, 增量计算, encodeURI 转换, crontab, mail
#参考
sqlite3 http://linuxgazette.net/109/chirico1.html
SGMLParser
http://www.woodpecker.org.cn/diveintopython/html_processing/index.html
抓取脚本 baiduTop500.py(Python 2):
import pycurl
from sgmllib import SGMLParser
import re
from urllib import quote, unquote
#使用 SGMLParser(html 分析) 类继承
#详细请查看
# http://www.woodpecker.org.cn/diveintopython/html_processing/index.html
class BaiduTop_GMLParser(SGMLParser):
    """Download the Baidu MP3 top-songs page and collect (actor, music) pairs.

    Python 2 only (relies on ``sgmllib``, ``StringIO`` and byte-string
    ``decode``/``encode``).  Constructing an instance performs the HTTP
    request immediately and feeds the response to the SGML parser; the
    pairs found are then available through :meth:`getData`.
    """

    # Page that is downloaded when an instance is created.
    TOP_URL = 'http://list.mp3.baidu.com/topso/mp3topsong.html?id=1?top2'

    # Links of interest carry "word=<music>+<actor>" in their href.
    _WORD_RE = re.compile(r'word=(.*)\+(.*)')

    def __init__(self):
        # SGMLParser.__init__ stores ``verbose`` and calls self.reset().
        SGMLParser.__init__(self)
        self.feed(self._fetch(self.TOP_URL))

    def reset(self, verbose=0):
        """Reset parser state and clear the collected results."""
        SGMLParser.reset(self)
        self.data = []
        self.a = None

    def _fetch(self, url):
        """Return the body of *url* as a byte string, following redirects."""
        import StringIO
        body = StringIO.StringIO()
        curl = pycurl.Curl()
        try:
            curl.setopt(pycurl.URL, url)
            curl.setopt(pycurl.WRITEFUNCTION, body.write)
            curl.setopt(pycurl.FOLLOWLOCATION, 1)
            curl.setopt(pycurl.MAXREDIRS, 5)
            # To go through a proxy, additionally set pycurl.PROXY and
            # pycurl.PROXYUSERPWD on the handle here.
            curl.perform()
        finally:
            # Release the libcurl handle even when the transfer fails.
            curl.close()
        return body.getvalue()

    def start_a(self, attrs):
        """Handle an opening <a> tag; record the song if its href matches.

        ``attrs`` is the list of (name, value) pairs supplied by SGMLParser.
        Anchors without an href, or whose href does not contain
        ``word=<music>+<actor>``, are ignored.
        """
        hrefs = [v for k, v in attrs if k == 'href']
        if not hrefs:
            # e.g. <a name="..."> -- nothing to inspect.
            return
        found = self._WORD_RE.search(hrefs[0])
        if found is None:
            return
        # The Baidu page is GBK-encoded and the Chinese text is URI-escaped;
        # unescape it and convert to UTF-8.
        music = unquote(found.group(1)).decode('gbk').encode('utf8')
        actors = unquote(found.group(2)).decode('gbk').encode('utf8')
        self.data.append((actors, music))
        self.a = True

    def getData(self):
        """Return the list of (actor, music) tuples collected so far."""
        return self.data
使用脚本 Action.py(Python 2):
#!python
# -*- coding: UTF8 -*-
'''
新添加入 top
当天全量
没有 歌手名
退出 top
drop table baidu_Top ;
create table baidu_Top (
id integer PRIMARY KEY AUTOINCREMENT ,
actor varchar(300) ,
music varchar(300) ,
createTime DATE
);
'''
import sqlite3,os,sys
import datetime, calendar
import pdb
from baiduTop500 import BaiduTop_GMLParser
class Action(object):
    """Store one day's chart in ``baidu_Top`` and diff it against yesterday.

    Parameters:
        conn: an open DB-API connection (sqlite3) to a database that
            contains the ``baidu_Top`` table.
        data: iterable of ``(actor, music)`` pairs for today's chart.

    After :meth:`insertAll` has run, the result lists hold:
        allData            every pair processed today
        allNotActorData    pairs from allData whose actor is "" or " "
        newData            pairs that have no row dated yesterday
        newNotActorData    pairs from newData whose actor is "" or " "
    """

    def __init__(self, conn, data):
        self.conn = conn
        self.data = data
        self.allData = []
        self.newData = []
        self.newNotActorData = []
        self.allNotActorData = []

    def insertAll(self):
        """Insert today's pairs (once per day) and fill the result lists.

        Errors raised while handling a single pair are printed (exception
        type, then message) and the loop continues with the next pair.
        """
        # The date is bound from Python on both insert and lookup.  SQLite's
        # date() yields the UTC date, which can differ from the local
        # date.today() used for the lookups and would break the comparison.
        insert_sql = "insert into baidu_Top (actor,music,createTime) values (?,?,?) ;"
        exists_sql = "select music from baidu_Top where actor=? and music=? and createTime=? ;"

        today = datetime.date.today()
        today_iso = today.isoformat()
        yesterday_iso = (today - datetime.timedelta(days=1)).isoformat()

        cur = self.conn.cursor()
        try:
            for actor, music in self.data:
                try:
                    pair = (actor, music)
                    no_actor = actor == "" or actor == " "

                    # Insert only if this pair has no row for today yet, so
                    # re-running the script on the same day adds no duplicates.
                    cur.execute(exists_sql, (actor, music, today_iso))
                    if not cur.fetchall():
                        cur.execute(insert_sql, (actor, music, today_iso))
                        # Commit on the connection this object was given,
                        # not on a module-level global.
                        self.conn.commit()

                    # NOTE(review): the source's indentation was lost; this
                    # assumes every pair of the day is reported under "all",
                    # whether or not it was inserted by this run -- confirm.
                    self.allData.append(pair)
                    if no_actor:
                        self.allNotActorData.append(pair)

                    # A pair is "new" when it has no row dated yesterday.
                    cur.execute(exists_sql, (actor, music, yesterday_iso))
                    if not cur.fetchall():
                        self.newData.append(pair)
                        if no_actor:
                            self.newNotActorData.append(pair)
                except Exception as myError:
                    # Best effort: report the failure and keep going.
                    print(sys.exc_info()[0])
                    print(myError)
        finally:
            cur.close()
def pfor(title, data):
    """Print one tab-separated line ``title<TAB>a<TAB>m`` per pair in *data*.

    ``data`` is an iterable of two-item sequences (here: actor, music).
    """
    for a, m in data:
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("%s\t%s\t%s" % (title, a, m))
def line():
    """Print a visual separator: blank line, two underscore rules, blank line."""
    rule = "___________________________________________________________________________________"
    # print("") emits an empty line on both Python 2 and 3; a bare ``print``
    # is a no-op expression on Python 3.
    print("")
    print(rule)
    print(rule)
    print("")
if __name__ == "__main__":
    # Connect before entering try/finally: if connect() itself fails there is
    # no connection to close, and the real error is not masked by a NameError
    # raised from the finally clause.
    conn = sqlite3.connect("/home/xj_liukaiyi/src/python/baidu_top/ex500")
    try:
        # Fetch today's chart, store it, and compute the day-over-day diff.
        ac = Action(conn, BaiduTop_GMLParser().getData())
        ac.insertAll()
        # Legend for the report sections printed below.
        print('''说明 %s :
new 对比前一天新添加
new not actor 对比前一天新增加但没歌手名
all 当天top 500 展现全部
all not actor 当天 top 500 展现全部全但没歌手 ''' % (datetime.date.today()))
        line()
        pfor("new", ac.newData)
        line()
        pfor("new not actor", ac.newNotActorData)
        line()
        pfor("all", ac.allData)
        line()
        pfor("all not actor", ac.allNotActorData)
    finally:
        conn.close()
再通过系统 crontab 定时执行:
crontab -e
每天早上 5 点运行,将输出由 UTF-8 转码为 GBK 后通过邮件发送。
0 5 * * * /usr/local/bin/python /home/xj_liukaiyi/src/python/baidu_top/Action.py|perl -MEncode -ne 'print encode("GBK", decode("UTF-8",$_));' > tmp ; mail -s "baidu Top 500" liukaiyi@gmail.com < tmp;
整理 www.blogjava.net/Good-Game