用Ruby脚本抓取百度MP3新歌Top 100

sevk · #1

代码：全选

require 'rubygems'
require 'hpricot'
require 'iconv'
require 'open-uri'
require 'yaml'

# Builds a substitution table: the 62-character alphabet (digits, upper-case,
# lower-case) rotated so that it begins at +x+.
#
# When +x+ does not occur in the alphabet, the alphabet is returned unrotated.
def _mktab(x)
  alphabet = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
  head, pivot, tail = alphabet.partition(x)
  [pivot, tail, head].join
end

# Reverses the character substitution applied to an obfuscated URL.
#
# The source table is the alphabet rotated to start at the first character of
# +s+; the target table is the alphabet rotated to start at 'h' when +s+
# contains four characters followed by ":/", and at 'f' otherwise.
def decode(s)
  source_table = _mktab(s[0].chr)
  scheme_initial = s =~ /....:\// ? 'h' : 'f' #http|ftp
  s.tr(source_table, _mktab(scheme_initial))
end

# Requests a shortened form of +url+ from the tinyurl.com create API.
#
# url    - the address to shorten; interpolated into the query string as-is.
# encode - when true, the whole request URL is passed through URI.encode
#          before it is opened.
#
# Returns the response body, or +url+ itself when the response has a line
# starting with "Error".
def tiny_url(url, encode = false)
  request_url = "http://tinyurl.com/api-create.php?url=#{url}"
  request_url = URI.encode(request_url) if encode
  response = open(request_url).read
  return url if response =~ /^Error/
  response
end

# Resolves a search-results page into decoded download URLs.
#
# search_url - address of the search-results page to parse.
# limit      - index of the last result link to follow; the inclusive range
#              0..limit means up to limit + 1 links are fetched.
#
# Each followed page is scanned for a `var I="..."` assignment whose value is
# handed to decode. Returns an array of the decoded strings.
def actual_download_url(search_url, limit = 2)
  results_page = Hpricot(open(search_url))
  hrefs = results_page.search("table#Tbs td.d a").map { |link| link.attributes["href"] }
  hrefs[0..limit].map do |href|
    page_body = open(URI.encode(href)).read
    decode(page_body[/var I="([^"]*)"/, 1])
  end
end

# Downloads the new-hits list page, converts it from GBK to UTF-8 and
# collects its entries.
#
# Returns a hash mapping each table cell's whitespace-collapsed text to the
# href of the first link inside that cell.
def get_maidu_mp3_top100
  raw_page = open("http://list.mp3.baidu.com/list/newhits.html").read
  page = Hpricot(Iconv.conv("UTF8", "GBK", raw_page))
  songs = {}
  page.search("table.list td:not(.th)").each do |cell|
    title = cell.inner_text.gsub(/\s+/, " ")
    songs[title] = cell.search("a")[0].attributes["href"]
  end
  songs
end

# Compares the freshly fetched list with the entries cached in data.yaml.
#
# Entries whose key is not yet cached are added to the cache, and the whole
# cache is written back to data.yaml in the current directory.
#
# Returns a hash containing only the entries that were not cached before.
def get_new_data
  local_data = begin
    File.open('data.yaml') { |file| YAML.load(file) }
  rescue StandardError
    # Best-effort: a missing or unreadable cache simply means "no history yet".
    {}
  end
  # An empty or non-mapping YAML document does not load as a Hash; treat it
  # as an empty cache instead of failing on the key lookup below.
  local_data = {} unless local_data.is_a?(Hash)

  new_data = {}
  get_maidu_mp3_top100.each_pair do |key, value|
    next if local_data.key?(key)

    local_data[key] = value
    new_data[key] = value
  end

  File.open('data.yaml', 'w') { |file| YAML.dump(local_data, file) }
  new_data
end

原文： http://www.javaeye.com/topic/336765

eexpress · #2

幸好ruby继承了perl的精华。

ibear · #3

这个脚本能用？

我记得最近百度把下载页面地址放在js中生成了，貌似一个简单的加密，无法直接抓到了（在python中复制了该js，是可以解出正确的地址的，不过觉得百度太恶，干脆懒得上百度找歌了）

lerosua · #4

ruby 不会用

用Ruby脚本抓取百度MP3新歌Top 100

用Ruby脚本抓取百度MP3新歌Top 100

Re: 用Ruby脚本抓取百度MP3新歌Top 100

Re: 用Ruby脚本抓取百度MP3新歌Top 100

Re: 用Ruby脚本抓取百度MP3新歌Top 100