Ubuntu中文论坛

发表于： **2013-12-07 22:57**

想找一个软件,通过编写特定的正则表达式实现对网站内容的抓取,然后可设定时间定期检查网站是否有更新,在windows下有一个软件叫"推乎"可以实现上述功能http://www.anypush.in/
Baidu和GOOGLE了没有发现类似的软件,难道得自己用Python编写吗?
求助一下版内的各位大侠,看大家有没有高招.

发表于： **2013-12-08 15:44**

https://github.com/lilydjwg/nvchecker
@lilydjwg

发表于： **2013-12-08 22:36**

yhylord 写了：https://github.com/lilydjwg/nvchecker
@lilydjwg

谢谢yhylord
自己学着用python写了个脚本，抓取爱听fm中坏蛋调频频道的内容，并且用notify-send发出通知，但是太粗糙了
贴出来大家一起提意见吧
#!/usr/bin/python
# -*- coding: utf-8 -*-
import os
import urllib
import urllib2
import sys
import re
import difflib
##### Fetch the page and extract the episode data #####
url = "http://www.itings.com/badfm/usercontent_2590p58543"
# Send the request with a custom User-Agent header.
req = urllib2.Request(url, headers={'User-Agent' : "Magic Browser"})
webpage = urllib2.urlopen(req)
try:
    websource = webpage.read()
finally:
    # Release the HTTP connection even if reading the body fails.
    webpage.close()
# Keep a copy of the raw page on disk; the with-block closes the file
# even if the write raises.
with open("web.txt", 'w') as web:
    web.write(websource)
# p1 captures every audioPath value ending in "mp3";
# p2 captures every quoted proName value.
p1 = re.compile('audioPath="(.*?mp3)')
p2 = re.compile('proName=(\".*?\")', re.DOTALL)
content1 = p1.findall(websource)
content2 = p2.findall(websource)
####判断source.txt是否已经存在#####
a = os.system("ls source.txt")
if(a == 512):
# print "source.txt不存在"
f1 = open("source.txt",'w')
f4 = open("list.txt", 'w')
for i in content1:
f1.write(i)
f1.write('\n')
f1.close()
for j in content2:
f4.write(j)
f4.write('\n')
f4.close()
else:
# print "source.txt文件已经存在"
f2 = open("newsource.txt", 'w')
f3 = open("newlist.txt", 'w')
for i in content1:
f2.write('%r\n' % (i))
for j in content2:
f3.write(j)
f3.write('\n')
f2.close()
#####比较模块######
def isDiff(srcfile, tarfile):
'''
compare with two files,if equal then return ture
'''
src = file(srcfile).read().split(' ')
tar = file(tarfile).read().split(' ')
ret = 1
# ignore blank lines
temp = difflib.SequenceMatcher(lambda x: len(x.strip()) == 0, src, tar)
for tag, i1, i2, j1, j2 in temp.get_opcodes():
#print tag
if tag != 'equal':
ret = 0
break
return (True if ret == 1 else False)
judge = isDiff("source.txt","newsource.txt")
if (judge == False) :
os.system("notify-send 坏蛋调频没有更新")
else:
os.system("notify-send 坏蛋调频更新了")
print "是否需要调用uget来下载新货？> ",
a=raw_input()
if (a == yes):
os.system("/usr/bin/uget")
else:
print "好吧，俺休息去了"
s1 = open("source.txt", 'w')
s2 = open("newsource.txt")
s3 = s2.read()
s1.write(s3)
s2.close()
s1.close()

发表于： **2013-12-08 23:25**

检查网站是否有更新还是页面是否有更新呢，查看特定页面完全可以用浏览器插件么，updatescanner

发表于： **2013-12-09 22:11**

onlylove 写了：检查网站是否有更新还是页面是否有更新呢，查看特定页面完全可以用浏览器插件么，updatescanner

应该是对页面内特定内容检查是否有更新，比如说在itingFM上的坏蛋调频有没有新的节目更新。

发表于： **2013-12-09 22:26**

这两天用python2写了个能初步实现自己想要功能的小程序，可以新建一个文件然后在文件的第一行输入网址，第二行输入要抓取链接的正则表达式，第三行输入要抓取标题的正则表达式，然后用“程序名 + 新建文件的文件名”运行程序，如新建文件名为itingfm
itingfm文件中有三行，分别是：
http://www.itings.com/badfm/usercontent_2590p58543
audioPath="(.*?mp3)
proName=(\".*?\")
然后运行python2 程序名 itingfm 就能获得想要的标题和链接。
下边是程序的源代码，感觉还是太繁杂了，后边慢慢修改吧，接下来还想将其转换成一个GUI程序，但是还不知道怎么弄，不知道有没有强人能帮忙实现。

代码：全选

#!/usr/bin/python
# -*- coding: utf-8 -*-
import os
import urllib
import urllib2
import sys
import re
import difflib
from sys import argv
import string
##### Read the config file and fetch the page #####
# Exactly one argument is expected: the path of the config file.
if len(argv) != 2:
	sys.exit("usage: %s CONFIG_FILE" % argv[0])
script, filename = argv
with open(filename) as n1:
	lines = n1.readlines()
# The config file must provide three lines: URL, link regex, title regex.
if len(lines) < 3:
	sys.exit("%s: expected 3 lines (URL, link regex, title regex)" % filename)
urls = lines[0].strip() # URL of the page to fetch
# split() followed by ' '.join() drops the trailing newline and collapses
# runs of whitespace inside the pattern to a single space.
linkpattern = ' '.join(lines[1].split()) # regex that captures the links
titlepattern = ' '.join(lines[2].split()) # regex that captures the titles
# Send the request with a custom User-Agent header.
req = urllib2.Request(urls, headers={'User-Agent' : "Magic Browser"})
webpage = urllib2.urlopen(req)
try:
	websource = webpage.read()
finally:
	# Release the HTTP connection even if reading the body fails.
	webpage.close()
# Keep a copy of the raw page on disk.
with open("web.txt",'w') as web:
	web.write(websource)
p1 = re.compile(linkpattern,re.DOTALL)
p2 = re.compile(titlepattern,re.DOTALL)
content1 = p1.findall(websource)
content2 = p2.findall(websource)
#### Decide whether a saved snapshot already exists ####
# pathway is the absolute path of the config file; it is used as a filename
# prefix, so the state files are named "<config path>source.txt" and so on.
pathway = os.path.abspath(filename)
if not os.path.exists(pathway+'source.txt'):
	# First run: store the current links and titles as the baseline.
	with open(pathway+'source.txt','w') as f1:
		for i in content1:
			f1.write(i)
			f1.write('\n')
	with open(pathway+'list.txt', 'w') as f4:
		for j in content2:
			f4.write(j)
			f4.write('\n')
else:
	# Later run: write the fresh results next to the saved ones.
	# Links are written raw (not through %r) so they use the same format as
	# the baseline written on the first run and unchanged content compares equal.
	with open(pathway+'newsource.txt', 'w') as f2:
		for i in content1:
			f2.write(i)
			f2.write('\n')
	with open(pathway+'newlist.txt', 'w') as f3:
		for j in content2:
			f3.write(j)
			f3.write('\n')
##### Compare module #####
	def isDiff(srcfile, tarfile):
		'''
		Report whether two files have identical contents.

		Despite the name, the result is True when the files are the same
		and False as soon as any difference is found.

		srcfile -- path of the first file to read
		tarfile -- path of the second file to read
		'''
		# Read both files through with-blocks so the handles are always closed.
		with open(srcfile) as src_fh:
			src = src_fh.read().split(' ')
		with open(tarfile) as tar_fh:
			tar = tar_fh.read().split(' ')
		# Whitespace-only tokens are flagged as junk for the matcher.
		# NOTE(review): the junk predicate only steers how difflib aligns
		# tokens; differing blank tokens still produce non-'equal' opcodes.
		temp = difflib.SequenceMatcher(lambda x: len(x.strip()) == 0, src, tar)
		for tag, i1, i2, j1, j2 in temp.get_opcodes():
			if tag != 'equal':
				return False
		return True
	judge = isDiff(pathway+'source.txt',pathway+'newsource.txt')
	if (judge == True) :
		os.system("notify-send 网站没有更新")
	else:
		os.system("notify-send 网站更新了") 
  		print "是否需要调用uget来下载新货？要启动打yes> ",
		a=raw_input()
		if a == "yes": 
			os.system("/usr/bin/uget-gtk")
		else: 
		   print "好吧，俺休息去了"
	s1 = open(pathway+'source.txt', 'w')
	s2 = open(pathway+'newsource.txt')
	s3 = s2.read()
	s1.write(s3)
	s2.close()
	s1.close()

Ubuntu中文论坛

求一下 LINUX下是否有类似"推乎"这样的软件

求一下 LINUX下是否有类似"推乎"这样的软件

Re: 求一下 LINUX下是否有类似"推乎"这样的软件

Re: 求一下 LINUX下是否有类似"推乎"这样的软件

Re: 求一下 LINUX下是否有类似"推乎"这样的软件

Re: 求一下 LINUX下是否有类似"推乎"这样的软件

Re: 求一下 LINUX下是否有类似"推乎"这样的软件