Python下载百度新歌100的代码

软件和网站开发以及相关技术探讨
头像
oneleaf
论坛管理员
帖子: 10441
注册时间: 2005-03-27 0:06
系统: Ubuntu 12.04

Python下载百度新歌100的代码

#1

帖子 oneleaf » 2006-05-24 16:44

最新代码见21楼

代码: 全选

#!/usr/bin/python
# -*- coding: utf-8 -*-
# Copyright (c) 2006 UbuntuChina <http://www.ubuntu.org.cn>
# License: GPLv2
# Author: oneleaf <oneleaf AT gmail.com>

import httplib
import re
import urllib
import os
import locale

def getdownurl(url):
    """Collect candidate download links from a Baidu MP3 result page.

    url is the request path that is fetched from mp3.baidu.com.

    Returns a list of link strings.  A link is a unicode string when its
    bytes decode as GBK, and the undecoded byte string otherwise.
    """
    conn = httplib.HTTPConnection('mp3.baidu.com')
    try:
        conn.request("GET", url)
        html = conn.getresponse().read()
    finally:
        # Release the socket even when the request or the read fails.
        conn.close()

    urllist = []
    anchors = re.findall('http://220.181.27.54/m(.*)</a>', html)
    # Only every second match is inspected (indices 0, 2, 4, ...).
    for anchor in anchors[::2]:
        titled = re.search('title=(.*)onclick', anchor)
        if not titled:
            continue
        link = re.search(r'http(\S*)', titled.group(0))
        if not link:
            continue
        mp3url = link.group(0)
        try:
            mp3url = mp3url.decode('gbk')
        except UnicodeError:
            # Not valid GBK: keep the raw bytes rather than dropping the link.
            pass
        urllist.append(mp3url)
    return urllist

def downmp3(url,author,name,filelist):
    filename=author+"-"+name;
    for i in filelist:
        name=unicode(i,locale.getpreferredencoding())
        if name.find(filename) == 0:
            print u"文件已经下载,忽略。"
            return 1
    urllists=getdownurl(url)
    for i in urllists:        
        print u"正在连接",i
        
        ext=i[-4:]
        try:
            urlopen = urllib.URLopener()
            fp=urlopen.open(i)
            data = fp.read()
            fp.close()
            filename=filename+ext;
            file=open(filename,'w+b')
            file.write(data)
            file.close()
            print u"下载成功!"
            return 1
        except:
            continue
    return 0

if __name__ == "__main__":
    conn = httplib.HTTPConnection('list.mp3.baidu.com')
    conn.request("GET",'/list/newhits.html?id=1')
    response = conn.getresponse()
    html=response.read().decode('gbk')
    conn.close()
    expression='<a href="http://mp3.baidu.com/m(.*)</a>'
    listSentence = re.findall(expression, html)
    lineno=0
    while lineno<len(listSentence): 
       url=re.search('(.*)target',listSentence[lineno])
       url='/m'+url.group(0)[:-8]
       name=re.search('blank>(.*)',listSentence[lineno])
       name=name.group(0)[6:]
       author=re.search('blank>(.*)',listSentence[lineno+1])
       author=author.group(0)[6:]
       print u"开始下载",author,name
       filelist=os.listdir('.');
       if downmp3(url,author,name,filelist)==0:
          print u"下载",author,name,u'失败!'
       lineno+=2
        
上次由 oneleaf 在 2007-08-06 12:04,总共编辑 3 次。
redhairboy
帖子: 38
注册时间: 2006-03-07 0:37

#2

帖子 redhairboy » 2006-05-28 0:49

:P :P :P
不错不错
收了
头像
oneleaf
论坛管理员
帖子: 10441
注册时间: 2005-03-27 0:06
系统: Ubuntu 12.04

#3

帖子 oneleaf » 2006-08-13 0:03

更新。

代码: 全选

#!/usr/bin/python
# -*- coding: utf-8 -*-
# Copyright (c) 2006 UbuntuChina <http://www.ubuntu.org.cn>
# License: GPLv2
# Author: oneleaf <oneleaf AT gmail.com>

import httplib
import re
import urllib
import os
import locale

def getdownfileurl(url):
    url = "http://220.181.27.54/m"+url
    tn = re.search('&tn=(.*)&word',url).group(0)
    url=url.replace(tn,'&tn=baidusg,mp3%20%20&word')
    try:
        urlopen = urllib.URLopener()
        fp=urlopen.open(url)
        data = fp.read()
        fp.close()
    except IOError, errmsg:
        print errmsg
    expression='"_blank">(.*)</a></a></li>'
    url = re.search(expression, data).group(0)[16:-13]
    try:
        url="http://"+urllib.quote(url)
    except:pass
    print u"发现 "+url
    return url

def getdownurl(url):
    """Return the final download addresses found on a song's result page.

    url is the request path fetched from mp3.baidu.com.  Every link on the
    page that matches the result pattern is resolved with getdownfileurl(),
    in page order.
    """
    connection = httplib.HTTPConnection('mp3.baidu.com')
    connection.request("GET", url)
    page = connection.getresponse().read()
    connection.close()
    pattern = 'http://220.181.27.54/m(.*)" target'
    return [getdownfileurl(tail) for tail in re.findall(pattern, page)]

def downmp3(url,author,name,filelist):
    filename=author+"-"+name;
    for i in filelist:
        name=unicode(i,locale.getpreferredencoding())
        if name.find(filename) == 0:
            print u"文件已经下载,忽略。"
            return 1
    urllists=getdownurl(url)
    for i in urllists:       
        print u"正在连接",i
        ext=i[-4:]
        try:
            urlopen = urllib.URLopener()
            fp=urlopen.open(i)
            data = fp.read()
            fp.close()
            filename=filename+ext;
            file=open(filename,'w+b')
            file.write(data)
            file.close()
            print u"下载成功!"
            return 1
        except:
            continue
    return 0

if __name__ == "__main__":
    conn = httplib.HTTPConnection('list.mp3.baidu.com')
    conn.request("GET",'/list/newhits.html?id=1')
    response = conn.getresponse()
    html=response.read().decode('gbk')
    conn.close()
    expression='<a href="http://mp3.baidu.com/m(.*)</a>'
    listSentence = re.findall(expression, html)
    lineno=0
    while lineno<len(listSentence):
       url=re.search('(.*)target',listSentence[lineno])
       url='/m'+url.group(0)[:-8]
       name=re.search('blank>(.*)',listSentence[lineno])
       name=name.group(0)[6:]
       author=re.search('blank>(.*)',listSentence[lineno+1])
       author=author.group(0)[6:]
       print u"开始下载",author,name
       filelist=os.listdir('.');
       if downmp3(url,author,name,filelist)==0:
          print u"下载",author,name,u'失败!'
       lineno+=2 
maven
帖子: 92
注册时间: 2005-03-26 0:02

根据歌曲名下载的,不是下载100首的版本

#4

帖子 maven » 2006-08-16 16:28

代码: 全选

#!/usr/bin/python
# -*- coding: utf8 -*-
# Copyright (c) 2006 UbuntuChina <http://www.ubuntu.org.cn>
# License: GPLv2
# old Author: oneleaf <oneleaf AT gmail.com>
# Modifier: petit<mawei.81 AT gmail.com>

import urllib
import BaseHTTPServer,SocketServer
import httplib
import re

# Opening part of every page served: document head, styles and the search
# form.  The single %s placeholder is filled with the text shown in the
# "word" input box.
htmlhead='''
<html>
<head>
<META content="text/html; charset=gbk" http-equiv=Content-Type>
<title>MP3 Search</title>
<style type="text/css">
td.row1   { background-color: #fff8f2; }
td.row2   { background-color: #f8f5ee; }
</style>
</head>
<body>
<form>
<input name="word" value="%s">&nbsp;<input value="Search"  type="submit">
</form>
'''
# Closing part of every page: ends the body and the document opened by
# htmlhead.
htmlfood='''
</body>
</html>
'''

def get(self,word):
    """Write the links Baidu MP3 search returns for word to the response.

    self -- the request handler whose wfile receives the output
    word -- the search term; it is inserted verbatim into the query string,
            so it must already be URL-encoded

    Each link found is written followed by '<br/>'.
    """
    conn = httplib.HTTPConnection('mp3.baidu.com')
    url = '/m?f=ms&rn=&tn=baidump3&ct=134217728&word=' + word + '&lm=-1'
    try:
        conn.request("GET", url)
        html = conn.getresponse().read()
    finally:
        # Close the connection even on failure so sockets are not leaked.
        conn.close()
    anchors = re.findall('http://220.181.27.54/m(.*)</a>', html)
    # Only every second match is inspected (indices 0, 2, 4, ...).
    for anchor in anchors[::2]:
        titled = re.search('title=(.*)onclick', anchor)
        if not titled:
            continue
        link = re.search(r'http(\S*)', titled.group(0))
        if not link:
            continue
        self.wfile.write(link.group(0) + '<br/>')

class RequestHandler(BaseHTTPServer.BaseHTTPRequestHandler):
    """Serve the search form and, when a word is given, the matching links."""

    def do_GET(self):
        """Answer a GET with the form, plus results if the path has a word.

        The value of the "word=" parameter is taken from the request path
        still URL-encoded and handed to get(); the decoded form is echoed
        back into the input box.
        """
        import cgi  # local import: only needed to escape the echoed word

        self.send_head()
        marker = self.path.find('word=')
        if marker == -1:
            self.wfile.write(htmlhead % '')
            self.wfile.write(htmlfood)
            return
        # Keep only this parameter's value; for "/?word=x" this is the same
        # text the fixed slice path[7:] used to produce.
        word = self.path[marker + len('word='):].split('&', 1)[0]
        # The word comes straight from the client and lands inside an HTML
        # attribute, so escape &, <, > and the double quote before echoing.
        self.wfile.write(htmlhead % cgi.escape(urllib.unquote(word), True))
        get(self, word)
        self.wfile.write(htmlfood)

    def send_head(self):
        """Send a 200 status line and the text/html content-type header."""
        self.send_response(200)
        self.send_header("content-type", "text/html")
        self.end_headers()

class myWebServer(SocketServer.ThreadingMixIn, BaseHTTPServer.HTTPServer):
    """HTTPServer combined with ThreadingMixIn; adds nothing of its own."""

if __name__ == "__main__":
    # Bind to port 7000 on all interfaces and serve until the process ends.
    listen_address = ('', 7000)
    server = myWebServer(listen_address, RequestHandler)
    server.serve_forever()
janusle
帖子: 36
注册时间: 2006-08-14 11:40

#5

帖子 janusle » 2006-08-26 0:21

顶!强烈支持!^_^
头像
SuperWar3Fan
帖子: 1263
注册时间: 2006-05-20 6:25
来自: 山东淄博
联系:

#6

帖子 SuperWar3Fan » 2006-08-26 10:37

首先lz很强
试验了一下,发现一个问题:
对于多人合唱的歌曲无法正确识别
头像
deng
帖子: 130
注册时间: 2006-04-09 14:09
来自: 北京
联系:

#7

帖子 deng » 2006-09-21 15:17

不错不错,严重支持!
Programming is fun
==========================
http://oteam.cn
图片
头像
ct
帖子: 2201
注册时间: 2005-04-06 21:15
来自: 安徽黄山
联系:

#8

帖子 ct » 2006-10-06 1:01

1已经用上,不过讨厌rm这样的格式,干脆加一行代码,指定下格式。
mp3老是下不动,wma比较快:lol:

2靠,确实只要一有多人合唱后面就全乱七八糟了,我再改 :lol:

3简单实现限制下载文件的大小,避免误下手机铃声那样的选段版,当然,改改也能只选小的下,代码在原来的结构上改得有些乱,不过能凑合用了就不再多事了.

代码: 全选

#!/usr/bin/python
# -*- coding: utf-8 -*-
# Copyright (c) 2006 UbuntuChina <http://www.ubuntu.org.cn>
# License: GPLv2
# Author: oneleaf <oneleaf AT gmail.com>
# hack by ct <ctqucl AT gmail.com>

import httplib
import re
import urllib
import os
import locale
# User settings.
fsize = 2      # lower limit for the file size, in MB
m = '0'        # format: '-1'=any '0'=mp3 '1'=rm '2'=wma '3'=asf '4'=ram '5'=mid '6'=flash
topid = '1'    # chart selector; replaced below by the chart's page path

# Chart selector -> page path.
_CHART_PAGES = {
    '0': '/list/newhits.html',          # new songs 100
    '1': '/topso/mp3topsong.html',      # Top500
    '2': '/list/oldsong.html',          # classic oldies
    '3': '/list/movies.html',           # movie hits
    '4': '/list/tvs.html',              # TV songs
    '5': '/minge/mp3topsong.html',      # selected folk songs
    '6': '/xiaoyuan/mp3topsong.html',   # campus songs
    '7': '/list/liujinsuiyue.html',     # golden years (new)
    '8': '/list/yaogun.html',           # rock zone
}
# A selector that is not in the table is left unchanged.
topid = _CHART_PAGES.get(topid, topid)


def getdownfileurl(url):                #获取歌曲页的试听URL
    url = "http://220.181.27.54/m"+url
    tn = re.search('&tn=(.*)&word',url).group(0)
    url=url.replace(tn,'&tn=baidusg,mp3%20%20&word')
    try:
        urlopen = urllib.URLopener()
        fp=urlopen.open(url)
        data = fp.read()
        fp.close()
    except IOError, errmsg:
        print errmsg
    expression2='"_blank">(.*)</a></a></li>'
    url = re.search(expression2, data).group(0)[16:-13]
    try:
        url="http://"+urllib.quote(url)
    except:pass
    #print u"发现 "+url
    return url

def getdownurl(url):
    """Scrape (download address, size text) pairs from a song's result page.

    url is the request path fetched from mp3.baidu.com.

    Returns a list of 2-tuples.  The first item is the address produced by
    getdownfileurl() for a link on the page; the second is the text found
    in front of "M" in a "<td>...M</td>" cell.  Links and sizes are paired
    by position, and the shorter list is padded with None.
    """
    conn = httplib.HTTPConnection('mp3.baidu.com')
    try:
        conn.request("GET", url)
        html = conn.getresponse().read()
    finally:
        # Release the socket even when the request or the read fails.
        conn.close()
    links = re.findall('http://220.181.27.54/m(.*)" target', html)
    filesize = re.findall('<td>(.*)M</td>', html)
    addresses = [getdownfileurl(link) for link in links]
    # map(None, a, b) is the Python 2 zip that pads the shorter list.
    return map(None, addresses, filesize)

def downmp3(url,author,name,filelist):    #下载歌曲
    filename=author+"-"+name;
    for i in filelist:
        name=unicode(i,locale.getpreferredencoding())
        if name.find(filename) == 0:      #忽略
            print u"文件已经下载,忽略。"
            return 1
    urllists=getdownurl(url)              #获取文件url列表 
    lineno=0
    while lineno<len(urllists):
        print u"尝试",urllists[lineno][0]
        ext=urllists[lineno][0][-4:]      #获取文件名后缀(最后4位)
        try:
            lineno+=1
            print urllists[lineno-1][1] +'M'
            if float(urllists[lineno-1][1])>float(fsize) :   #大小符合则下载
                  urlopen = urllib.URLopener()
                  fp=urlopen.open(urllists[lineno-1][0])
                  data = fp.read()
                  fp.close()
                  filename=filename+ext;
                  file=open(filename,'w+b')
                  file.write(data)
                  file.close()
                  print u"下载成功!"
                  return 1
            elif float(urllists[lineno][1])<float(fsize) :   #不符则略过
                  print u"文件太小,忽略!"
        except:
            continue
    return 0

if __name__ == "__main__":
    conn = httplib.HTTPConnection('list.mp3.baidu.com')
    conn.request("GET",topid )  #类型
    response = conn.getresponse()
    html=response.read().decode('gbk')
    conn.close()
    expression1='border">(.*).</td>'
    expression2='><a href="http://mp3.baidu.com/m(.*)</a>'
    expression3='href="http://mp3.baidu.com/m(.*)</td>'
    listSentence1 = re.findall(expression1, html)   #编号特征
    listSentence2 = re.findall(expression2, html)   #歌曲名特征
    listSentence3 = re.findall(expression3, html)   #歌手名特征
    lineno=0
    while lineno<len(listSentence1):
       listSentence2[lineno]=listSentence2[lineno].replace('m=-1','m=' + m) #指定格式
       url=re.search('(.*)target',listSentence2[lineno])
       url='/m'+url.group(0)[:-8]
       idno=listSentence1[lineno]
       name=re.search('blank>(.*)',listSentence2[lineno])
       name=name.group(0)[6:]
       dirty=re.search('</A>/<A  href=(.*) target=_blank>',listSentence3[lineno])
       if dirty is not None :             #合唱
	   author1=re.search('>(.*)</A>/<A',listSentence3[lineno])
	   author1=author1.group(0)[1:-7]
	   author2=re.search('/<A  href=(.*)</A>',listSentence3[lineno])
	   author2=re.search('>(.*)<',author2.group(0))
	   author2=author2.group(0)[1:-1]
	   author=author1 + '+' + author2
       elif dirty is None :               #独唱
           author=re.search('blank>(.*)</',listSentence3[lineno])
           author=author.group(0)[6:-2]
       print u"开始下载",idno,name,author
       filelist=os.listdir('.');
       if downmp3(url,author,name,filelist)==0:  #判断失败
          print u"下载",author,name,u'失败!'
       lineno+=1
newbuding
帖子: 1
注册时间: 2006-10-14 20:17

#9

帖子 newbuding » 2006-10-14 20:26

现在的baidu mp3察看最终地址的地方怎么用urllib打不开了?
sandorf
帖子: 16
注册时间: 2006-12-21 18:05

#10

帖子 sandorf » 2006-12-21 21:19

可以写一个multi-thread版本的,下起来快些
头像
yang119345
帖子: 570
注册时间: 2005-06-19 14:22
来自: 上海
联系:

#11

帖子 yang119345 » 2007-01-28 13:28

:D 好帅的帖子呀! :?
头像
forrid
帖子: 659
注册时间: 2007-04-23 17:40

#12

帖子 forrid » 2007-07-16 20:03

不是牛A,也不是牛C!

吾生也有涯,而知也无涯,以有涯随无涯,SB啊~~~~~~~~~~
头像
tyllrxs
帖子: 115
注册时间: 2006-09-10 14:48

#13

帖子 tyllrxs » 2007-07-29 5:07

试听页面的html格式似乎改动了,我帮着修改了一下,应该比较完美了 :wink: :wink:

代码: 全选

#!/usr/bin/python
# -*- coding: utf-8 -*-
# Copyright (c) 2006 UbuntuChina <http://www.ubuntu.org.cn>
# License: GPLv2
# Author: oneleaf <oneleaf AT gmail.com>
# hack by ct <ctqucl AT gmail.com>

import httplib
import re
import urllib
import os
import locale
# User settings.
fsize = 2      # lower limit for the file size, in MB
m = '0'        # format: '-1'=any '0'=mp3 '1'=rm '2'=wma '3'=asf '4'=ram '5'=mid '6'=flash
topid = '0'    # chart selector; replaced below by the chart's page path

# Chart selector -> page path.
_CHART_PAGES = {
    '0': '/list/newhits.html',          # new songs 100
    '1': '/topso/mp3topsong.html',      # Top500
    '2': '/list/oldsong.html',          # classic oldies
    '3': '/list/movies.html',           # movie hits
    '4': '/list/tvs.html',              # TV songs
    '5': '/minge/mp3topsong.html',      # selected folk songs
    '6': '/xiaoyuan/mp3topsong.html',   # campus songs
    '7': '/list/liujinsuiyue.html',     # golden years (new)
    '8': '/list/yaogun.html',           # rock zone
}
# A selector that is not in the table is left unchanged.
topid = _CHART_PAGES.get(topid, topid)


def getdownfileurl(url):                #获取歌曲页的试听URL
    url = "http://220.181.38.82/m"+url    
    tn = re.search('&tn=(.*)&word',url).group(0)
    url=url.replace(tn,'&tn=baidusg,mp3%20%20&word')
    print url
    try:
        urlopen = urllib.URLopener()
        fp=urlopen.open(url)
        data = fp.read()
        fp.close()
    except IOError, errmsg:
        print errmsg
    expression2='"_blank">(.*)</a></li>'
    url = re.search(expression2, data).group(0)[16:-9]
    try:
        url="http://"+urllib.quote(url)
    except:pass
    print u"发现 "+url
    return url

def getdownurl(url):
    """Scrape (download address, size text) pairs from a song's result page.

    url is the request path fetched from mp3.baidu.com.

    Returns a list of 2-tuples.  The first item is the address produced by
    getdownfileurl() for a link on the page; the second is the text found
    in front of "M" in a "<td>...M</td>" cell.  Links and sizes are paired
    by position, and the shorter list is padded with None.
    """
    conn = httplib.HTTPConnection('mp3.baidu.com')
    try:
        conn.request("GET", url)
        html = conn.getresponse().read()
    finally:
        # Release the socket even when the request or the read fails.
        conn.close()
    links = re.findall('http://220.181.38.82/m(.*)" target', html)
    filesize = re.findall('<td>(.*)M</td>', html)
    addresses = [getdownfileurl(link) for link in links]
    # map(None, a, b) is the Python 2 zip that pads the shorter list.
    return map(None, addresses, filesize)

def downmp3(url,author,name,filelist):    #下载歌曲
    filename=author+"-"+name;
    for i in filelist:
        name=unicode(i,locale.getpreferredencoding())
        if name.find(filename) == 0:      #忽略
            print u"文件已经下载,忽略。"
            return 1
    urllists=getdownurl(url)              #获取文件url列表
    lineno=0
    while lineno<len(urllists):
        print u"尝试",urllists[lineno][0]
        ext=urllists[lineno][0][-4:]      #获取文件名后缀(最后4位)
        try:
            lineno+=1
            print urllists[lineno-1][1] +'M'
            if float(urllists[lineno-1][1])>float(fsize) :   #大小符合则下载
                  urlopen = urllib.URLopener()
                  fp=urlopen.open(urllists[lineno-1][0])
                  data = fp.read()
                  fp.close()
                  filename=filename+ext;
                  file=open(filename,'w+b')
                  file.write(data)
                  file.close()
                  print u"下载成功!"
                  return 1
            elif float(urllists[lineno][1])<float(fsize) :   #不符则略过
                  print u"文件太小,忽略!"
        except:
            continue
    return 0

if __name__ == "__main__":
    conn = httplib.HTTPConnection('list.mp3.baidu.com')
    conn.request("GET",topid )  #类型
    response = conn.getresponse()
    html=response.read().decode('gbk')
    conn.close()
    expression1='border">(.*).</td>'
    expression2='><a href="http://mp3.baidu.com/m(.*)</a>'
    expression3='href="http://mp3.baidu.com/m(.*)</td>'
    listSentence1 = re.findall(expression1, html)   #编号特征
    listSentence2 = re.findall(expression2, html)   #歌曲名特征
    listSentence3 = re.findall(expression3, html)   #歌手名特征
    lineno=0
    while lineno<len(listSentence1):
       listSentence2[lineno]=listSentence2[lineno].replace('m=-1','m=' + m) #指定格式
       url=re.search('(.*)target',listSentence2[lineno])
       url='/m'+url.group(0)[:-8]
       idno=listSentence1[lineno]
       name=re.search('blank>(.*)',listSentence2[lineno])
       name=name.group(0)[6:]
       dirty=re.search('</A>/<A  href=(.*) target=_blank>',listSentence3[lineno])
       if dirty is not None :             #合唱
	     author1=re.search('>(.*)</A>/<A',listSentence3[lineno])
	     author1=author1.group(0)[1:-7]
	     author2=re.search('/<A  href=(.*)</A>',listSentence3[lineno])
	     author2=re.search('>(.*)<',author2.group(0))
	     author2=author2.group(0)[1:-1]
	     author=author1 + '+' + author2
       elif dirty is None :               #独唱
           author=re.search('blank>(.*)</',listSentence3[lineno])
           author=author.group(0)[6:-2]
       print u"开始下载",idno,name,author,u"来自",url
       filelist=os.listdir('.');
       if downmp3(url,author,name,filelist)==0:  #判断失败
          print u"下载",author,name,u'失败!'
       lineno+=1  
头像
tyllrxs
帖子: 115
注册时间: 2006-09-10 14:48

#14

帖子 tyllrxs » 2007-07-29 5:16

我已经在下了,见图片,真是太爽了 :lol: :lol:
附件
百度mp3下载中。。。
百度mp3下载中。。。
头像
内存不足
论坛版主
帖子: 3277
注册时间: 2005-08-18 18:43
联系:

#15

帖子 内存不足 » 2007-07-29 8:49

发现 http://www.znsjw.com/Ring/Up/20070701/2 ... 818759.mp3
http://220.181.38.82/m?ct=134217728&tn= ... 7euTE$.mp3,,[%D4%D9%BC%FB+%D5%C5%D5%F0%D4%C0]&si=%D4%D9%BC%FB+good+bye;;%D5%C5%D5%F0%D4%C0;;0;;0&lm=16777216


卡在这里不动了 是我的问题?连接的问题?好像碰到死连接就卡住了
໌→ iCookie Gtk Theme
໌→DropPlusBluetheme
กรัก กรัก`菠萝
回复