Python下载百度新歌100的代码

oneleaf · #1

最新代码见21楼

代码：全选

#!/usr/bin/python
# -*- coding: utf-8 -*-
# Copyright (c) 2006 UbuntuChina <http://www.ubuntu.org.cn>
# License: GPLv2
# Author: oneleaf <oneleaf AT gmail.com>

import httplib
import re
import urllib
import os
import locale

def getdownurl(url):
    """Collect candidate download URLs from a Baidu MP3 result page.

    Sends ``GET url`` to mp3.baidu.com, scans the response for links to
    220.181.27.54 and extracts the ``http...`` address found between
    ``title=`` and ``onclick`` in each inspected link.

    Args:
        url: Request path (with query string) on mp3.baidu.com.

    Returns:
        A list of URLs in page order.  An entry is ``unicode`` when its
        bytes decode as GBK, otherwise the undecoded byte string.
    """
    conn = httplib.HTTPConnection('mp3.baidu.com')
    try:
        conn.request("GET",url)
        html=conn.getresponse().read()
    finally:
        # Release the socket even when the request or the read fails.
        conn.close()
    urllist=[]
    # Only every second match is inspected, exactly as before.
    # NOTE(review): presumably each result row yields two matches -- confirm.
    for sentence in re.findall('http://220.181.27.54/m(.*)</a>', html)[::2]:
        title=re.search('title=(.*)onclick',sentence)
        if not title:
            continue
        mp3url=re.search(r'http(\S*)',title.group(0))
        if not mp3url:
            continue
        mp3url=mp3url.group(0)
        try:
            mp3url=mp3url.decode('gbk')
        except UnicodeError:
            # Not valid GBK: keep the raw bytes rather than drop the link.
            pass
        urllist.append(mp3url)
    return urllist

def downmp3(url,author,name,filelist):
    filename=author+"-"+name;
    for i in filelist:
        name=unicode(i,locale.getpreferredencoding())
        if name.find(filename) == 0:
            print u"文件已经下载，忽略。"
            return 1
    urllists=getdownurl(url)
    for i in urllists:        
        print u"正在连接",i
        
        ext=i[-4:]
        try:
            urlopen = urllib.URLopener()
            fp=urlopen.open(i)
            data = fp.read()
            fp.close()
            filename=filename+ext;
            file=open(filename,'w+b')
            file.write(data)
            file.close()
            print u"下载成功!"
            return 1
        except:
            continue
    return 0

if __name__ == "__main__":
    conn = httplib.HTTPConnection('list.mp3.baidu.com')
    conn.request("GET",'/list/newhits.html?id=1')
    response = conn.getresponse()
    html=response.read().decode('gbk')
    conn.close()
    expression='<a href="http://mp3.baidu.com/m(.*)</a>'
    listSentence = re.findall(expression, html)
    lineno=0
    while lineno<len(listSentence): 
       url=re.search('(.*)target',listSentence[lineno])
       url='/m'+url.group(0)[:-8]
       name=re.search('blank>(.*)',listSentence[lineno])
       name=name.group(0)[6:]
       author=re.search('blank>(.*)',listSentence[lineno+1])
       author=author.group(0)[6:]
       print u"开始下载",author,name
       filelist=os.listdir('.');
       if downmp3(url,author,name,filelist)==0:
          print u"下载",author,name,u'失败！'
       lineno+=2

redhairboy · #2

${L_SMILIES_RAZZ}$ ${L_SMILIES_RAZZ}$ ${L_SMILIES_RAZZ}$
不错不错
收了

oneleaf · #3

更新。

代码：全选

#!/usr/bin/python
# -*- coding: utf-8 -*-
# Copyright (c) 2006 UbuntuChina <http://www.ubuntu.org.cn>
# License: GPLv2
# Author: oneleaf <oneleaf AT gmail.com>

import httplib
import re
import urllib
import os
import locale

def getdownfileurl(url):
    url = "http://220.181.27.54/m"+url
    tn = re.search('&tn=(.*)&word',url).group(0)
    url=url.replace(tn,'&tn=baidusg,mp3%20%20&word')
    try:
        urlopen = urllib.URLopener()
        fp=urlopen.open(url)
        data = fp.read()
        fp.close()
    except IOError, errmsg:
        print errmsg
    expression='"_blank">(.*)</a></a></li>'
    url = re.search(expression, data).group(0)[16:-13]
    try:
        url="http://"+urllib.quote(url)
    except:pass
    print u"发现 "+url
    return url

def getdownurl(url):
    """Return the resolved file URL of every result on a song's result page.

    Sends ``GET url`` to mp3.baidu.com, finds each link to 220.181.27.54
    and passes it through ``getdownfileurl``.

    Args:
        url: Request path (with query string) on mp3.baidu.com.

    Returns:
        A list with one ``getdownfileurl`` result per link, in page order.
    """
    conn = httplib.HTTPConnection('mp3.baidu.com')
    try:
        conn.request("GET",url)
        html=conn.getresponse().read()
    finally:
        # Release the socket even when the request or the read fails.
        conn.close()
    links = re.findall('http://220.181.27.54/m(.*)" target', html)
    return [getdownfileurl(link) for link in links]

def downmp3(url,author,name,filelist):
    filename=author+"-"+name;
    for i in filelist:
        name=unicode(i,locale.getpreferredencoding())
        if name.find(filename) == 0:
            print u"文件已经下载，忽略。"
            return 1
    urllists=getdownurl(url)
    for i in urllists:       
        print u"正在连接",i
        ext=i[-4:]
        try:
            urlopen = urllib.URLopener()
            fp=urlopen.open(i)
            data = fp.read()
            fp.close()
            filename=filename+ext;
            file=open(filename,'w+b')
            file.write(data)
            file.close()
            print u"下载成功!"
            return 1
        except:
            continue
    return 0

if __name__ == "__main__":
    conn = httplib.HTTPConnection('list.mp3.baidu.com')
    conn.request("GET",'/list/newhits.html?id=1')
    response = conn.getresponse()
    html=response.read().decode('gbk')
    conn.close()
    expression='<a href="http://mp3.baidu.com/m(.*)</a>'
    listSentence = re.findall(expression, html)
    lineno=0
    while lineno<len(listSentence):
       url=re.search('(.*)target',listSentence[lineno])
       url='/m'+url.group(0)[:-8]
       name=re.search('blank>(.*)',listSentence[lineno])
       name=name.group(0)[6:]
       author=re.search('blank>(.*)',listSentence[lineno+1])
       author=author.group(0)[6:]
       print u"开始下载",author,name
       filelist=os.listdir('.');
       if downmp3(url,author,name,filelist)==0:
          print u"下载",author,name,u'失败！'
       lineno+=2

maven · #4

代码：全选

#!/usr/bin/python
# -*- coding: utf8 -*-
# Copyright (c) 2006 UbuntuChina <http://www.ubuntu.org.cn>
# License: GPLv2
# old Author: oneleaf <oneleaf AT gmail.com>
# Modifier: petit<mawei.81 AT gmail.com>

import urllib
import BaseHTTPServer,SocketServer
import httplib
import re

# Opening part of every page: document head, styles and the search form.
# The single %s is filled in by do_GET with the text to show in the "word"
# input box (an empty string when no search was requested).
htmlhead='''
<html>
<head>
<META content="text/html; charset=gbk" http-equiv=Content-Type>
<title>MP3 Search</title>
<style type="text/css">
td.row1   { background-color: #fff8f2; }
td.row2   { background-color: #f8f5ee; }
</style>
</head>
<body>
<form>
<input name="word" value="%s">&nbsp;<input value="Search"  type="submit">
</form>
'''
# Closing tags, written by do_GET after the form and any results.
htmlfood='''
</body>
</html>
'''

def get(self,word):
    """Search mp3.baidu.com for ``word`` and write each hit to the client.

    This is a module-level function; the request handler is passed in
    explicitly as ``self`` and only its ``wfile`` is used.

    Args:
        self: Request handler whose ``wfile`` receives the output.
        word: Search term, concatenated into the Baidu query string as-is
            (no quoting is applied here).
    """
    # Alternative lyric-search query kept from the original for reference:
    # '/m?f=ms&tn=baidump3lyric&ct=150994944&lf=2&rn=10&word='+word+'&lm=-1'
    url='/m?f=ms&rn=&tn=baidump3&ct=134217728&word='+word+'&lm=-1'
    conn = httplib.HTTPConnection('mp3.baidu.com')
    try:
        conn.request("GET",url)
        html=conn.getresponse().read()
    finally:
        # The original never closed this connection.
        conn.close()
    # Only every second match is inspected, exactly as before.
    # NOTE(review): presumably each result row yields two matches -- confirm.
    for sentence in re.findall('http://220.181.27.54/m(.*)</a>', html)[::2]:
        title=re.search('title=(.*)onclick',sentence)
        if not title:
            continue
        mp3url=re.search(r'http(\S*)',title.group(0))
        if mp3url:
            self.wfile.write(mp3url.group(0) + '<br/>')

class RequestHandler(BaseHTTPServer.BaseHTTPRequestHandler):
    """Serve the search form and, when a word is given, its search results."""

    def do_GET(self):
        """Answer a GET request with the form, plus results if requested.

        A path that does not contain ``word`` gets the empty form.  Otherwise
        everything after the first seven characters of the path is taken as
        the (still percent-encoded) search word.
        """
        import cgi  # local import: only needed to escape the echoed word
        self.send_head()
        if self.path.find('word')==-1:
            self.wfile.write(htmlhead % '')
            self.wfile.write(htmlfood)
        else:
            # Assumes the path has the form '/?word=<term>' (a 7-character
            # prefix), which is what the page's own form submits.
            word=self.path[7:]
            # The decoded word lands inside value="..."; escape it (quotes
            # included) so request text cannot inject markup into the page.
            self.wfile.write(htmlhead % cgi.escape(urllib.unquote(word),True))
            get(self,word)
            self.wfile.write(htmlfood)

    def send_head(self):
        """Send a 200 status line and the text/html content-type header."""
        self.send_response(200)
        self.send_header("content-type","text/html")
        self.end_headers()

class myWebServer(SocketServer.ThreadingMixIn,BaseHTTPServer.HTTPServer):
    """HTTPServer combined with ThreadingMixIn; adds nothing of its own."""

if __name__ == "__main__":
    # Bind to every interface on port 7000 and serve until interrupted.
    listen_address = ('', 7000)
    myWebServer(listen_address, RequestHandler).serve_forever()

janusle · #5

顶！强烈支持！^_^

SuperWar3Fan · #6

首先lz很强
试验了一下，发现一个问题:
对于多人合唱的歌曲无法正确识别

deng · #7

不错不错，严重支持！

ct · #8

1已经用上，不过讨厌rm这样的格式，干脆加一行代码，指定下格式。
mp3老是下不动，wma比较快:lol:

2靠,确实只要一有多人合唱后面就全乱七八糟了,我再改 ${L_SMILIES_LAUGHING}$

3简单实现限制下载文件的大小，避免误下手机铃声那样的选段版，当然，改改也能只选小的下,代码在原来的结构上改得有些乱,不过能凑合用了就不再多事了.

代码：全选

#!/usr/bin/python
# -*- coding: utf-8 -*-
# Copyright (c) 2006 UbuntuChina <http://www.ubuntu.org.cn>
# License: GPLv2
# Author: oneleaf <oneleaf AT gmail.com>
# hack by ct <ctqucl AT gmail.com>

import httplib
import re
import urllib
import os
import locale
# User settings.  (The module-level ``global`` statements were no-ops and
# are not needed for these names to be visible inside the functions below.)
fsize=2     # minimum file size to accept, in megabytes
m='0'       # format: '-1'=any '0'=mp3 '1'=rm '2'=wma '3'=asf '4'=ram '5'=mid '6'=flash
topid='1'   # which chart to download; key into the table below

# Chart selector -> page path requested from list.mp3.baidu.com.
_chart_paths={
    '0':'/list/newhits.html',         # New songs 100
    '1':'/topso/mp3topsong.html',     # Top 500
    '2':'/list/oldsong.html',         # Classic oldies
    '3':'/list/movies.html',          # Movie hits
    '4':'/list/tvs.html',             # TV songs
    '5':'/minge/mp3topsong.html',     # Folk song selection
    '6':'/xiaoyuan/mp3topsong.html',  # Campus songs
    '7':'/list/liujinsuiyue.html',    # Golden years (new)
    '8':'/list/yaogun.html',          # Rock zone
}
# A selector that is not in the table is left untouched.
topid=_chart_paths.get(topid,topid)


def getdownfileurl(url):                #获取歌曲页的试听URL
    url = "http://220.181.27.54/m"+url
    tn = re.search('&tn=(.*)&word',url).group(0)
    url=url.replace(tn,'&tn=baidusg,mp3%20%20&word')
    try:
        urlopen = urllib.URLopener()
        fp=urlopen.open(url)
        data = fp.read()
        fp.close()
    except IOError, errmsg:
        print errmsg
    expression2='"_blank">(.*)</a></a></li>'
    url = re.search(expression2, data).group(0)[16:-13]
    try:
        url="http://"+urllib.quote(url)
    except:pass
    #print u"发现 "+url
    return url

def getdownurl(url):
    """Return (file URL, size) pairs for the results on a song's page.

    Sends ``GET url`` to mp3.baidu.com, collects every link to
    220.181.27.54 and every ``<td>...M</td>`` size cell, and pairs them up
    by position.

    Args:
        url: Request path (with query string) on mp3.baidu.com.

    Returns:
        A list of ``(file_url, size_text)`` tuples in page order, where
        ``file_url`` comes from ``getdownfileurl`` and ``size_text`` is the
        text found before ``M`` in the size cell.
    """
    conn = httplib.HTTPConnection('mp3.baidu.com')
    try:
        conn.request("GET",url)
        html=conn.getresponse().read()
    finally:
        # Release the socket even when the request or the read fails.
        conn.close()
    links = re.findall('http://220.181.27.54/m(.*)" target', html)   # result links
    filesize=re.findall('<td>(.*)M</td>',html)                       # file sizes
    # zip() stops at the shorter list.  The original map(None, ...) padded
    # with None instead, and a None URL made downmp3 fail on its slice.
    # Pairing first also avoids resolving links that have no size.
    return [(getdownfileurl(link),size) for link,size in zip(links,filesize)]

def downmp3(url,author,name,filelist):    #下载歌曲
    filename=author+"-"+name;
    for i in filelist:
        name=unicode(i,locale.getpreferredencoding())
        if name.find(filename) == 0:      #忽略
            print u"文件已经下载，忽略。"
            return 1
    urllists=getdownurl(url)              #获取文件url列表 
    lineno=0
    while lineno<len(urllists):
        print u"尝试",urllists[lineno][0]
        ext=urllists[lineno][0][-4:]      #获取文件名后缀(最后4位)
        try:
            lineno+=1
            print urllists[lineno-1][1] +'M'
            if float(urllists[lineno-1][1])>float(fsize) :   #大小符合则下载
                  urlopen = urllib.URLopener()
                  fp=urlopen.open(urllists[lineno-1][0])
                  data = fp.read()
                  fp.close()
                  filename=filename+ext;
                  file=open(filename,'w+b')
                  file.write(data)
                  file.close()
                  print u"下载成功!"
                  return 1
            elif float(urllists[lineno][1])<float(fsize) :   #不符则略过
                  print u"文件太小,忽略!"
        except:
            continue
    return 0

if __name__ == "__main__":
    conn = httplib.HTTPConnection('list.mp3.baidu.com')
    conn.request("GET",topid )  #类型
    response = conn.getresponse()
    html=response.read().decode('gbk')
    conn.close()
    expression1='border">(.*).</td>'
    expression2='><a href="http://mp3.baidu.com/m(.*)</a>'
    expression3='href="http://mp3.baidu.com/m(.*)</td>'
    listSentence1 = re.findall(expression1, html)   #编号特征
    listSentence2 = re.findall(expression2, html)   #歌曲名特征
    listSentence3 = re.findall(expression3, html)   #歌手名特征
    lineno=0
    while lineno<len(listSentence1):
       listSentence2[lineno]=listSentence2[lineno].replace('m=-1','m=' + m) #指定格式
       url=re.search('(.*)target',listSentence2[lineno])
       url='/m'+url.group(0)[:-8]
       idno=listSentence1[lineno]
       name=re.search('blank>(.*)',listSentence2[lineno])
       name=name.group(0)[6:]
       dirty=re.search('</A>/<A  href=(.*) target=_blank>',listSentence3[lineno])
       if dirty is not None :             #合唱
	   author1=re.search('>(.*)</A>/<A',listSentence3[lineno])
	   author1=author1.group(0)[1:-7]
	   author2=re.search('/<A  href=(.*)</A>',listSentence3[lineno])
	   author2=re.search('>(.*)<',author2.group(0))
	   author2=author2.group(0)[1:-1]
	   author=author1 + '+' + author2
       elif dirty is None :               #独唱
           author=re.search('blank>(.*)</',listSentence3[lineno])
           author=author.group(0)[6:-2]
       print u"开始下载",idno,name,author
       filelist=os.listdir('.');
       if downmp3(url,author,name,filelist)==0:  #判断失败
          print u"下载",author,name,u'失败！'
       lineno+=1

newbuding · #9

现在的baidu mp3察看最终地址的地方怎么用urllib打不开了？

sandorf · #10

可以写一个multi-thread版本的，下起来快些

yang119345 · #11

${L_SMILIES_VERY_HAPPY}$ 好帅的帖子呀！ ${L_SMILIES_CONFUSED}$

forrid · #12

不是牛A，也不是牛C！

tyllrxs · #13

试听页面的html格式似乎改动了，我帮着修改了一下，应该比较完美了 ${L_SMILIES_WINK}$ ${L_SMILIES_WINK}$

代码：全选

#!/usr/bin/python
# -*- coding: utf-8 -*-
# Copyright (c) 2006 UbuntuChina <http://www.ubuntu.org.cn>
# License: GPLv2
# Author: oneleaf <oneleaf AT gmail.com>
# hack by ct <ctqucl AT gmail.com>

import httplib
import re
import urllib
import os
import locale
# User settings.  (The module-level ``global`` statements were no-ops and
# are not needed for these names to be visible inside the functions below.)
fsize=2     # minimum file size to accept, in megabytes
m='0'       # format: '-1'=any '0'=mp3 '1'=rm '2'=wma '3'=asf '4'=ram '5'=mid '6'=flash
topid='0'   # which chart to download; key into the table below

# Chart selector -> page path requested from list.mp3.baidu.com.
_chart_paths={
    '0':'/list/newhits.html',         # New songs 100
    '1':'/topso/mp3topsong.html',     # Top 500
    '2':'/list/oldsong.html',         # Classic oldies
    '3':'/list/movies.html',          # Movie hits
    '4':'/list/tvs.html',             # TV songs
    '5':'/minge/mp3topsong.html',     # Folk song selection
    '6':'/xiaoyuan/mp3topsong.html',  # Campus songs
    '7':'/list/liujinsuiyue.html',    # Golden years (new)
    '8':'/list/yaogun.html',          # Rock zone
}
# A selector that is not in the table is left untouched.
topid=_chart_paths.get(topid,topid)


def getdownfileurl(url):                #获取歌曲页的试听URL
    url = "http://220.181.38.82/m"+url    
    tn = re.search('&tn=(.*)&word',url).group(0)
    url=url.replace(tn,'&tn=baidusg,mp3%20%20&word')
    print url
    try:
        urlopen = urllib.URLopener()
        fp=urlopen.open(url)
        data = fp.read()
        fp.close()
    except IOError, errmsg:
        print errmsg
    expression2='"_blank">(.*)</a></li>'
    url = re.search(expression2, data).group(0)[16:-9]
    try:
        url="http://"+urllib.quote(url)
    except:pass
    print u"发现 "+url
    return url

def getdownurl(url):
    """Return (file URL, size) pairs for the results on a song's page.

    Sends ``GET url`` to mp3.baidu.com, collects every link to
    220.181.38.82 and every ``<td>...M</td>`` size cell, and pairs them up
    by position.

    Args:
        url: Request path (with query string) on mp3.baidu.com.

    Returns:
        A list of ``(file_url, size_text)`` tuples in page order, where
        ``file_url`` comes from ``getdownfileurl`` and ``size_text`` is the
        text found before ``M`` in the size cell.
    """
    conn = httplib.HTTPConnection('mp3.baidu.com')
    try:
        conn.request("GET",url)
        html=conn.getresponse().read()
    finally:
        # Release the socket even when the request or the read fails.
        conn.close()
    links = re.findall('http://220.181.38.82/m(.*)" target', html)   # result links
    filesize=re.findall('<td>(.*)M</td>',html)                       # file sizes
    # zip() stops at the shorter list.  The original map(None, ...) padded
    # with None instead, and a None URL made downmp3 fail on its slice.
    # Pairing first also avoids resolving links that have no size.
    return [(getdownfileurl(link),size) for link,size in zip(links,filesize)]

def downmp3(url,author,name,filelist):    #下载歌曲
    filename=author+"-"+name;
    for i in filelist:
        name=unicode(i,locale.getpreferredencoding())
        if name.find(filename) == 0:      #忽略
            print u"文件已经下载，忽略。"
            return 1
    urllists=getdownurl(url)              #获取文件url列表
    lineno=0
    while lineno<len(urllists):
        print u"尝试",urllists[lineno][0]
        ext=urllists[lineno][0][-4:]      #获取文件名后缀(最后4位)
        try:
            lineno+=1
            print urllists[lineno-1][1] +'M'
            if float(urllists[lineno-1][1])>float(fsize) :   #大小符合则下载
                  urlopen = urllib.URLopener()
                  fp=urlopen.open(urllists[lineno-1][0])
                  data = fp.read()
                  fp.close()
                  filename=filename+ext;
                  file=open(filename,'w+b')
                  file.write(data)
                  file.close()
                  print u"下载成功!"
                  return 1
            elif float(urllists[lineno][1])<float(fsize) :   #不符则略过
                  print u"文件太小,忽略!"
        except:
            continue
    return 0

if __name__ == "__main__":
    conn = httplib.HTTPConnection('list.mp3.baidu.com')
    conn.request("GET",topid )  #类型
    response = conn.getresponse()
    html=response.read().decode('gbk')
    conn.close()
    expression1='border">(.*).</td>'
    expression2='><a href="http://mp3.baidu.com/m(.*)</a>'
    expression3='href="http://mp3.baidu.com/m(.*)</td>'
    listSentence1 = re.findall(expression1, html)   #编号特征
    listSentence2 = re.findall(expression2, html)   #歌曲名特征
    listSentence3 = re.findall(expression3, html)   #歌手名特征
    lineno=0
    while lineno<len(listSentence1):
       listSentence2[lineno]=listSentence2[lineno].replace('m=-1','m=' + m) #指定格式
       url=re.search('(.*)target',listSentence2[lineno])
       url='/m'+url.group(0)[:-8]
       idno=listSentence1[lineno]
       name=re.search('blank>(.*)',listSentence2[lineno])
       name=name.group(0)[6:]
       dirty=re.search('</A>/<A  href=(.*) target=_blank>',listSentence3[lineno])
       if dirty is not None :             #合唱
	     author1=re.search('>(.*)</A>/<A',listSentence3[lineno])
	     author1=author1.group(0)[1:-7]
	     author2=re.search('/<A  href=(.*)</A>',listSentence3[lineno])
	     author2=re.search('>(.*)<',author2.group(0))
	     author2=author2.group(0)[1:-1]
	     author=author1 + '+' + author2
       elif dirty is None :               #独唱
           author=re.search('blank>(.*)</',listSentence3[lineno])
           author=author.group(0)[6:-2]
       print u"开始下载",idno,name,author,u"来自",url
       filelist=os.listdir('.');
       if downmp3(url,author,name,filelist)==0:  #判断失败
          print u"下载",author,name,u'失败！'
       lineno+=1

tyllrxs · #14

我已经在下了，见图片，真是太爽了 ${L_SMILIES_LAUGHING}$ ${L_SMILIES_LAUGHING}$

内存不足 · #15

发现 http://www.znsjw.com/Ring/Up/20070701/2 ... 818759.mp3
http://220.181.38.82/m?ct=134217728&tn= ... 7euTE$.mp3,,[%D4%D9%BC%FB+%D5%C5%D5%F0%D4%C0]&si=%D4%D9%BC%FB+good+bye;;%D5%C5%D5%F0%D4%C0;;0;;0&lm=16777216

卡在这里不动了是我的问题？连接的问题？好像碰到死连接就卡住了

Python下载百度新歌100的代码

Python下载百度新歌100的代码

根据歌曲名下载的,不是下载100首的版本