Python 2.5 写的抓取www.weather.com.cn天气预报程序

软件和网站开发以及相关技术探讨
回复
头像
oneleaf
论坛管理员
帖子: 10441
注册时间: 2005-03-27 0:06
系统: Ubuntu 12.04

Python 2.5 写的抓取www.weather.com.cn天气预报程序

#1

帖子 oneleaf » 2009-02-10 9:23

来源: http://www.lalfa.com/index.php/2009/02/06/440/
代码写于一年前,一直没有用在实际系统中。不知道针对现在的天气预报网站是不是有效,不过对各位应该有很大的参考价值

使用BeautifulSoup做HTML分析。

抓取最近的5天数据,并保存到mysql数据库中。

如果出现处理失败,会向指定的邮件地址,发送报警。这是一个比较完善的天气预报抓取程序。

代码: 全选

#! /usr/bin/env python
# -*- coding: utf-8 -*-

"""
版权:www.lalfa.com  ealpha@gmail.com
转载请不要删除
todo :设置一个字段表示,是否成功更新,一旦成功则记录ID,系统运行结束,进行update。否则发送短信。保证只发送一次
"""
import os,urllib2,re,MySQLdb,datetime,time,smtplib
from BeautifulSoup import BeautifulSoup
from StringIO import StringIO
from email.mime.text import MIMEText

# Browser-like User-Agent sent with every forecast request.  The value starts
# directly with the product token; leading whitespace inside the literal would
# be sent as part of the header value.
USER_AGENT = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.0.1) Gecko/2008070208 Firefox/3.0.1'
# A forecast page URL is BASE_URL_BEGIN + <page id> + BASE_URL_END.
BASE_URL_BEGIN = 'http://www.weather.com.cn/html/weather/'
BASE_URL_END = '.shtml'
# Shared MySQL connection; it is opened as soon as this module is loaded.
conn = MySQLdb.connect(
    host="localhost",
    user="fun",
    passwd="fun",
    db="fun",
    use_unicode=1,
    charset='utf8'
)

# Recipients of the alert e-mail.
mailto_list = ["XXXXX@XXXXX.com"]

# SMTP server, account name, password and the domain used for the sender
# address.
mail_host = "XXXX.com"
mail_user = "XXXX"
mail_pass = "XXXX"
mail_postfix = "imichat.com"

# Record ids (wid) whose update failed; consulted afterwards to decide whether
# an alert e-mail has to be sent.
faultwid = []

# Retry counter for failed updates.
dotime = 0

def send_mail(to_list, sub, content):
    """Send a plain-text e-mail through the configured SMTP server.

    The sender address is built from the module-level ``mail_user`` and
    ``mail_postfix``; the server and the credentials come from ``mail_host``,
    ``mail_user`` and ``mail_pass``.

    Args:
        to_list: List of recipient addresses.  A single address given as a
            plain string is accepted as well.
        sub: Subject line.
        content: Message body.

    Returns:
        True when the message was accepted without an error, False when any
        step failed (the error text is printed).

    Example:
        send_mail(["aaa@126.com"], "sub", "content")
    """
    import sys

    # Joining a bare string would split it into single characters, so wrap it.
    if isinstance(to_list, str):
        to_list = [to_list]

    me = mail_user + "<" + mail_user + "@" + mail_postfix + ">"
    msg = MIMEText(content)
    msg['Subject'] = sub
    msg['From'] = me
    # Address lists in mail headers are comma-separated.
    msg['To'] = ", ".join(to_list)

    s = None
    try:
        s = smtplib.SMTP()
        s.connect(mail_host)
        s.login(mail_user, mail_pass)
        s.sendmail(me, to_list, msg.as_string())
        return True
    except Exception:
        # The active exception is read from sys.exc_info() so this handler
        # needs no version-specific exception-binding syntax.
        print(str(sys.exc_info()[1]))
        return False
    finally:
        # Always release the connection, including after a failed login or
        # a failed delivery.
        if s is not None:
            s.close()

def getFiveDayWeather(wid, pageid, agent=USER_AGENT):
    """Download one forecast page and process the forecast section in it.

    Args:
        wid: Record id, forwarded unchanged to getWeatherList.
        pageid: Page id placed between BASE_URL_BEGIN and BASE_URL_END to
            form the URL.
        agent: Value sent as the User-Agent request header.

    Returns:
        The child nodes of the element that contains ``<div id="dd_0">``.

    Raises:
        ValueError: if the page contains no ``<div id="dd_0">``.
        Any error raised while fetching or processing the page is propagated
        to the caller.
    """
    url = BASE_URL_BEGIN + pageid + BASE_URL_END

    request = urllib2.Request(url)
    request.add_header('User-Agent', agent)
    response = urllib2.build_opener().open(request)
    try:
        page = response.read()
    finally:
        # Release the HTTP connection even if reading the body fails.
        response.close()

    soup = BeautifulSoup(page, fromEncoding="utf-8")

    anchor = soup.find('div', id='dd_0')
    if anchor is None:
        # Fail with a readable message instead of an AttributeError on None.
        raise ValueError("forecast block 'dd_0' not found in %s" % url)
    html = anchor.parent.contents

    getWeatherList(wid, html)

    return html

def getWeatherList(wid, html):
    """Extract the publication heading and hand every daily forecast on.

    The text form of ``html`` is parsed again.  The last child of its first
    ``<h2>`` is used as the update time (an empty string when the heading has
    no children).  Every ``<div class="fut_weatherbox">`` is then passed to
    getOneDayWeather together with its 1-based position.
    """
    fragment = BeautifulSoup(str(html))
    heading = fragment.find('h2')

    update_time = ''
    for child in heading:
        # Keep overwriting so that the last child of the heading wins.
        update_time = child

    day_boxes = fragment.findAll('div', {"class": "fut_weatherbox"})
    for position, day_box in enumerate(day_boxes):
        getOneDayWeather(wid, position + 1, update_time, day_box)

def getOneDayWeather(wid, dayid, update_time, html):
    """Pull the fields of one day's forecast out of ``html`` and store them.

    The values read are: the contents of the first ``<h3>``, the icon codes of
    the first two ``<img>`` elements, and the contents of the ``<h4>``
    elements with classes ``temp00_dn`` (weather), ``temp01_dn`` (maximum
    temperature), ``temp02_dn`` (minimum temperature) and ``temp03_dn``
    (wind).  They are passed to insertDB together with ``wid``, ``dayid`` and
    ``update_time``.
    """
    def reparse(fragment):
        # Serialise the fragment and parse the text again as its own document.
        return BeautifulSoup(StringIO(str(fragment)), fromEncoding="UTF-8")

    def icon_code(img):
        # File name of the image without its extension: the second-to-last
        # dot-separated piece, e.g. a name like "d01.gif" would give "d01".
        src = reparse(img).first('img')['src']
        return src.split('/')[-1].split('.')[-2]

    def first_h4_contents(h4_nodes):
        return reparse(h4_nodes).h4.renderContents()

    soup = reparse(html)
    headings = soup.findAll('h3')
    icons = soup.findAll('img')
    weather_nodes = soup.findAll('h4', {"class": "temp00_dn"})
    max_nodes = soup.findAll('h4', {"class": "temp01_dn"})
    min_nodes = soup.findAll('h4', {"class": "temp02_dn"})
    wind_nodes = soup.findAll('h4', {"class": "temp03_dn"})

    day_value = reparse(headings).h3.renderContents()

    # The images are addressed by index instead of looped over so that each
    # value comes from a fixed position (presumably day icon first, night
    # icon second -- TODO confirm against the page layout).
    d_pic_value = icon_code(icons[0])
    n_pic_value = icon_code(icons[1])

    weather_value = first_h4_contents(weather_nodes)
    max_temp = first_h4_contents(max_nodes)
    min_temp = first_h4_contents(min_nodes)
    wind = first_h4_contents(wind_nodes)

    insertDB(wid, dayid, update_time, day_value, d_pic_value, n_pic_value,
             weather_value, max_temp, min_temp, wind)

def insertDB(wid,dayid,update_time,day_value,d_pic_value,n_pic_value,weather_value,max_temp,min_temp,wind ):
    """Insert one forecast row into ``weatherdetail`` and commit it.

    The arguments map, in order, onto the columns wid, dayid, lastupdate,
    currdate, dpic, npic, weather, maxtemp, mintemp and wind.

    Raises:
        Any error raised by the database driver.  The pending transaction is
        rolled back before the error is passed on to the caller.

    TODO: commit the five days of one record together instead of row by row.
    """
    sql = ("INSERT INTO weatherdetail( wid, dayid, lastupdate, currdate, dpic, npic,"
           "weather, maxtemp, mintemp, wind) "
           "VALUES( %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)")
    param = (wid, dayid, update_time, day_value, d_pic_value, n_pic_value,
             weather_value, max_temp, min_temp, wind)

    cursor = conn.cursor()
    try:
        cursor.execute(sql, param)
        conn.commit()
    except Exception:
        # Do not leave a half-finished transaction on the shared connection.
        conn.rollback()
        raise
    finally:
        cursor.close()

def sendMonitor():
    """Send the alert e-mail when at least one record failed.

    Nothing happens while ``faultwid`` is empty.  Otherwise the list of failed
    ids is mailed to ``mailto_list`` and the outcome of the delivery is
    printed.
    """
    if not faultwid:
        return

    subject = "Error: Get Weather Error " + str(datetime.datetime.now())
    if send_mail(mailto_list, subject, str(faultwid)):
        print("监控邮件发送成功.")
    else:
        print("监控邮件发送失败.")

def doworking(dotime, wid, pageid):
    """Update one record, retrying after a failure.

    Args:
        dotime: Number of attempts already made for this record; pass 0 for a
            first attempt.
        wid: Record id; appended to ``faultwid`` when every attempt failed.
        pageid: Page id handed to getFiveDayWeather.

    A failed attempt is retried after five seconds for as long as the attempt
    counter is below 3.  Once it reaches 3 the record is given up: ``wid`` is
    stored in ``faultwid`` right away, without another wait.
    """
    attempt = dotime
    while True:
        try:
            getFiveDayWeather(wid, pageid)
            return
        except Exception:
            if attempt >= 3:
                # No retry follows, so neither announce one nor wait for it.
                faultwid.append(wid)
                return
            print("has one error on %s %s , then do it again , waiting five secs." % (wid, pageid))
            time.sleep(5)
            attempt += 1

def main():
    """Script entry point: update every record of the ``weather`` table.

    Prints the start and end time, sends the alert e-mail through sendMonitor
    and finally prints the ``seconds`` component of the elapsed time.
    """
    started = datetime.datetime.now()
    print("Start." + str(started))

    cursor = conn.cursor()
    cursor.execute("SELECT id,weather_com_cn_pageid FROM weather")
    for row in cursor.fetchall():
        # Every record starts with an attempt counter of 0 (first attempt).
        doworking(0, str(row[0]), row[1])

    print('\r')
    finished = datetime.datetime.now()
    print("End." + str(finished))
    print("-------------------------------------------------")
    sendMonitor()
    print((finished - started).seconds)


if __name__ == "__main__":
    main()
头像
anticlockwise
帖子: 2394
注册时间: 2007-03-01 20:46
来自: 湖南长沙

Re: Python 2.5 写的抓取www.weather.com.cn天气预报程序

#2

帖子 anticlockwise » 2009-02-11 0:58

哈哈哈哈~~当年大四的时候我在UML课上给大家讲课就写了这个程序,不过是用面向对象的理念写的。我还记得是从weather.com获取HTML,用BeautifulSoup解析,获取天气信息,然后用Cheetah生成XML和用MySQL-Python存到数据库,并用Cheetah生成HTML和用Reportlab生成PDF两份报表
soiamso
帖子: 418
注册时间: 2008-09-06 2:00

Re: Python 2.5 写的抓取www.weather.com.cn天气预报程序

#3

帖子 soiamso » 2009-04-17 17:20

Google Weather API 加 Google GeoLocation API 也是不错的。

Google GeoLocation API 得到访问ip或地名的大概经纬度;
Google Weather API 通过经纬度或地名得到现时天气及天气预报,及表示相应天气icon的URL

发帖再写一个 :em11
头像
percy
帖子: 508
注册时间: 2006-09-10 8:19
系统: Gentoo/Mac OS X
来自: Shanghai,China
联系:

Re: Python 2.5 写的抓取www.weather.com.cn天气预报程序

#4

帖子 percy » 2009-08-05 17:15

看到的代码没有缩进,这里的有:http://gist.github.com/61926
头像
percy
帖子: 508
注册时间: 2006-09-10 8:19
系统: Gentoo/Mac OS X
来自: Shanghai,China
联系:

Re: Python 2.5 写的抓取www.weather.com.cn天气预报程序

#5

帖子 percy » 2009-08-06 0:13

那个数据库怎么创建的,有哪些表?
回复