# Forum paste artifact ("Code: Select all"); not part of the script.
#!/usr/bin/python
#coding=utf-8
import pipes
import time
import urllib
import urllib2
def get_value(slice):
    """Return the text between the first pair of double quotes in *slice*.

    *slice* is one comma-separated argument cut out of a call expression,
    e.g. ``' "abc" '`` yields ``'abc'``.

    If *slice* contains no double quote at all, the whitespace-stripped
    text is returned; if only an opening quote is present, everything after
    it is returned.  (Previously both cases fell through to a ``-1`` index
    and silently dropped the last character.)

    The parameter name shadows the ``slice`` builtin; it is kept so that
    keyword callers keep working.
    """
    st = slice.find('"')
    if st == -1:
        # Unquoted argument (for example a bare number).
        return slice.strip()
    se = slice.find('"', st + 1)
    if se == -1:
        # Opening quote without a closing one: keep the remainder.
        return slice[st + 1:]
    return slice[st + 1:se]
def parse_page_content(content, flag, flag_start, flag_end):
    """Collect the first four arguments of every *flag* call found in *content*.

    For each occurrence of *flag*, the text between the next *flag_start*
    and the following *flag_end* is split on commas and the first four
    pieces are passed through ``get_value``.

    Parameters:
        content: text to scan.
        flag: marker identifying a call of interest (e.g. a function name).
        flag_start: delimiter that opens the argument list (e.g. ``"("``).
        flag_end: delimiter that closes the argument list (e.g. ``")"``).

    Returns:
        A list of 4-tuples of strings, in order of appearance.

    Occurrences with fewer than four comma-separated arguments are skipped
    instead of raising IndexError, and scanning stops as soon as an opening
    or closing delimiter can no longer be found.

    NOTE: arguments are split on every comma, so a quoted argument that
    itself contains a comma is still split in two.
    """
    r_list = []
    from_index = 0
    while True:
        start_d = content.find(flag, from_index)
        if start_d == -1:
            return r_list
        start_d_from = content.find(flag_start, start_d)
        if start_d_from == -1:
            # No argument list follows this (or any later) occurrence.
            return r_list
        # Skip the whole opening delimiter, not just its first character.
        args_start = start_d_from + len(flag_start)
        start_d_end = content.find(flag_end, args_start)
        if start_d_end == -1:
            # Unterminated argument list: nothing more can be parsed.
            return r_list
        params = content[args_start:start_d_end].split(",")
        if len(params) >= 4:
            r_list.append(tuple(get_value(param) for param in params[:4]))
        # Always resume strictly after the current match.
        from_index = start_d_end + len(flag_end)
def _fetch_url(url):
    """Return the body of *url*, closing the connection even if reading fails."""
    handle = urllib2.urlopen(url)
    try:
        return handle.read()
    finally:
        handle.close()


def _extract_media_urls(page):
    """Return the quoted ``href`` targets in *page* that point at .mp3/.wma files.

    Only values starting with ``http://`` or ``https://`` and ending in
    ``.mp3`` or ``.wma`` (case-insensitive) are kept, in page order.
    """
    found = []
    from_ind = 0
    while True:
        st_ind = page.find("href", from_ind)
        if st_ind == -1:
            break
        st_ind_f = page.find('"', st_ind)
        if st_ind_f == -1:
            # No quote after this href: stop, otherwise the closing-quote
            # search would restart from the top of the page and loop forever.
            break
        st_ind_e = page.find('"', st_ind_f + 1)
        if st_ind_e == -1:
            break
        mp3url = page[st_ind_f + 1:st_ind_e]
        tmp = mp3url.lower()
        if tmp.startswith(("http://", "https://")) and tmp.endswith((".mp3", ".wma")):
            found.append(mp3url.strip())
        from_ind = st_ind_e
    return found


def main():
    """Search mp3.youdao.com for a keyword and write a wget download script.

    Prompts for a keyword, reads the first result page plus follow-up pages,
    visits the "samesong" page of every result, gathers the .mp3/.wma links
    found there and writes one ``wget`` line per distinct link to
    ``mp3list-<keyword>.sh`` in the current directory.
    """
    keyword = raw_input("Search Keyword>> ")
    if keyword is None or keyword.strip() == "":
        print("Please give the search keyword")
        return
    # Percent-encode the keyword so spaces, '&', '#' and non-ASCII bytes do
    # not corrupt the query string.
    query = urllib.quote_plus(keyword)
    search_content = _fetch_url("http://mp3.youdao.com/search?q=" + query + "&t=")
    flag_start = "<!--result-num begin-->"
    flag_end = "<!--result-num end-->"
    sd = search_content.find(flag_start)
    if sd == -1:
        print("Your search have no results")
        return
    se = search_content.find(flag_end, sd + len(flag_start))
    num_text = ""
    if se != -1:
        num_text = search_content[sd + len(flag_start):se].strip().replace(" ", "")
    try:
        num = int(num_text)
    except ValueError:
        # The count is only displayed, so unparsable text must not abort the run.
        num = num_text or "?"
    print("Your search have about " + str(num) + " results")
    # Collect the result entries of the first page.
    result_list = []
    result_list.extend(parse_page_content(search_content, "openDetailUrl", '(', ')'))
    # Go through the remaining result pages one by one.  PAGE_COUNT is used
    # both as the page size and as the number of the last page requested.
    PAGE_COUNT = 20
    for i in range(2, PAGE_COUNT + 1):
        g_ind_from = (i - 1) * PAGE_COUNT
        url = ("http://mp3.youdao.com/search?q=" + query + "&start=" + str(g_ind_from)
               + "&ue=utf8&keyfrom=music.page" + str(i) + "&t=&len=" + str(PAGE_COUNT))
        try:
            s_d = _fetch_url(url)
            result_list.extend(parse_page_content(s_d, "openDetailUrl", "(", ")"))
        except Exception as exc:
            # Best effort: keep what was collected so far, but say why we stopped.
            print("Stopped paging at page " + str(i) + ": " + str(exc))
            break
    # Visit every result's page and collect the media links.
    mp3Urls = []
    counter = 1
    e_counter = 1
    for each in result_list:
        d = each[1]
        pos = each[2]
        keyfrom = each[3]
        t2 = int(time.time())
        url = ("http://mp3.youdao.com/samesong?q=" + query + "&pos=" + str(pos)
               + "&d=" + str(d) + "&keyfrom=" + str(keyfrom) + "&t2=" + str(t2))
        print("(" + str(counter) + ") Processing " + url)
        try:
            pg_content = _fetch_url(url)
        except Exception as exc:
            print("(" + str(e_counter) + ") Failed " + url + ": " + str(exc))
            e_counter += 1
            continue
        mp3Urls.extend(_extract_media_urls(pg_content))
        counter += 1
    # Drop duplicates while keeping first-seen order so the output is stable.
    seen = set()
    unique_urls = []
    for each in mp3Urls:
        if each not in seen:
            seen.add(each)
            unique_urls.append(each)
    # '/' in the keyword would otherwise be treated as a directory separator.
    fname = ("mp3list-" + str(keyword) + ".sh").replace(" ", "-").replace("/", "-")
    with open(fname, "w") as fp:
        for each in unique_urls:
            # The URLs come from remote pages; quote them so characters such
            # as '&' or ';' cannot be interpreted by the shell.
            fp.write("wget " + pipes.quote(each) + "\n")
# Run the interactive search only when executed as a script, not on import.
if __name__ == "__main__":
    main()