#! /usr/bin/python
import urllib, re,sys,os,os.path,getopt
threadNum = 20
savePath = "mp3-2"
optlist,left = getopt.getopt(sys.argv[1:], "t:d:")
for opt in optlist:
print opt
if (opt[0]=='-t'):
threadNum = int(opt[1])
if (opt[0]=='-d'):
savePath = opt[1]
print "threadnum="+str(threadNum)
print "savePath="+savePath
def getUrlData(url, retries=3):
    """Fetch ``url`` and return its content as a list of lines.

    Up to ``retries`` attempts are made (default 3). Any error raised
    while opening or reading the URL counts as a failed attempt and the
    next attempt is started immediately.

    Args:
        url: address passed to ``urllib.urlopen``.
        retries: maximum number of attempts before giving up.

    Returns:
        The list produced by ``readlines()`` on the first successful
        attempt, or an empty list if every attempt failed.
    """
    for _ in range(retries):
        try:
            f = urllib.urlopen(url)
            try:
                return f.readlines()
            finally:
                # Close the handle even when reading fails part-way.
                f.close()
        except Exception:
            # Best effort: a failed attempt is simply retried. Catching
            # Exception (not a bare except) lets KeyboardInterrupt and
            # SystemExit propagate so the script can still be aborted.
            continue
    return []
# Download the index page and collect every link that points at a
# "...tsomp3.htm" detail page.
# NOTE(review): `url` is not assigned anywhere in the visible part of this
# file -- confirm where it is supposed to come from.
data = getUrlData(url)
pattern = re.compile(r'href="(.*?tsomp3.htm)')
target = []
for line in data:
    # Cheap substring test first; the regex only runs on candidate lines.
    if "tsomp3.htm" in line:
        target.extend(pattern.findall(line))
print("find  %d  mp3 " % len(target))

# Patterns applied to each detail page by the download workers:
# the direct .mp3 links and the track name inside the <title> element.
mp3Pattern = re.compile(r'href="(.*?\.mp3)"')
titlePattern = re.compile(r'<title>.*?_(.*?)\s+</title>')
import threading
# Serializes access to the shared `target` list: getMp3 acquires this lock
# before it checks for and removes the next queued entry.
lock = threading.Lock()
def getMp3():
    """Worker loop: download one MP3 for each page queued in ``target``.

    Repeatedly takes the next page path off the shared ``target`` list
    (under ``lock``), fetches ``base + path`` with ``getUrlData``, extracts
    the track title and the candidate ``.mp3`` links from the page, and
    tries the links in order, saving to ``savePath/<title>.mp3``. A page is
    considered done as soon as one download is larger than 500 KiB.

    Returns when ``target`` is empty.

    NOTE(review): ``base`` is not assigned anywhere in the visible part of
    this file -- confirm where it is defined.
    """
    while True:
        # The lock must be released on every way out of this section,
        # including the return: leaving it held would block all the other
        # workers in acquire() forever and the process would never exit.
        with lock:
            if not target:
                return
            pagePath = target.pop(0)

        data = getUrlData(base + pagePath)

        # The first line whose <title> matches supplies the track name.
        title = ""
        for line in data:
            if "title" not in line:
                continue
            m = titlePattern.search(line)
            if m:
                title = m.group(1)
                break

        # Collect candidate links; stop scanning once more than 10 are found.
        mp3Target = []
        for line in data:
            if len(mp3Target) > 10:
                break
            if ".mp3" in line:
                mp3Target.extend(mp3Pattern.findall(line))

        filename = savePath + "/" + title + ".mp3"
        for mp3Url in mp3Target:
            print("try to get " + title + ".mp3,url= " + mp3Url)
            try:
                urllib.urlretrieve(mp3Url, filename)
                size = os.path.getsize(filename)
            except Exception:
                # Best effort: report the failure and try the next link.
                # Exception (not a bare except) keeps Ctrl-C working.
                print("fail to get " + title + ".mp3 with url " + mp3Url)
                continue
            # Anything at or below 500 KiB is treated as a bad download
            # and the next candidate link overwrites the same file.
            if size > 500 * 1024:
                print("done:" + title + ".mp3")
                break
# Launch the requested number of download workers; each one runs getMp3
# until the shared queue is empty.
for num in range(threadNum):
    thread = threading.Thread(target=getMp3)
    thread.start()
    print("start thread  %d" % num)