o site da rtp mudou, adaptar para a nova versao

This commit is contained in:
Pedro de Oliveira 2015-01-27 22:14:02 +00:00
parent 6a1657c794
commit a2e88c1bdd
1 changed files with 19 additions and 19 deletions

38
rtp.py
View File

@ -1,5 +1,6 @@
#!/usr/bin/env python #!/usr/bin/env python
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# vim: tabstop=8 expandtab shiftwidth=4 softtabstop=4
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
import urllib2 import urllib2
@ -8,6 +9,7 @@ import unicodedata
import os import os
import string import string
import sys import sys
import time
months = { months = {
'Jan': '01', 'Jan': '01',
@ -35,10 +37,10 @@ def parseRTMP(url,title,progId):
url = 'http://www.rtp.pt' + url url = 'http://www.rtp.pt' + url
programpath = scriptpath+"/"+progId programpath = scriptpath+"/"+progId
if os.path.isdir(programpath) == False: if os.path.isdir(programpath) == False:
os.makedirs(programpath) os.makedirs(programpath)
destfn = programpath+"/"+title+'.mp3' destfn = programpath+"/"+title+'.mp3'
page = urllib2.urlopen(url) page = urllib2.urlopen(url)
match = re.search('"file": ".*?//(.*?)"', page.read(), re.MULTILINE) match = re.search('wavrss(.+?)"', page.read())
if match: if match:
if os.path.isfile(destfn): if os.path.isfile(destfn):
print "- Ja downloadada... a ignorar" print "- Ja downloadada... a ignorar"
@ -57,24 +59,19 @@ if sys.argv[1].isdigit():
else: else:
sys.exit("progId tem de ser um numero") sys.exit("progId tem de ser um numero")
# apanhar o numero total de paginas
url = "http://www.rtp.pt/play/browseprog/"+id+"/1/true"
page = urllib2.urlopen(url)
match = re.search(r'<a title="Fim.*?,page:(\d+)\}', page.read(), re.MULTILINE)
if match:
totalpages = match.group(1)
else:
exit
exists = 0 exists = 0
for c in range(1,int(totalpages)): c = 1
while True:
print "--- Pagina " + str(c) print "--- Pagina " + str(c)
url = "http://www.rtp.pt/play/browseprog/"+id+"/"+str(c)+"/" url = "http://www.rtp.pt/play/bg_l_ep/?stamp=" + str(int(time.time())) + "&listDate=&listQuery=&listProgram=" + str(id) + "&listcategory=&listchannel=&listtype=recent&page=" + str(c) + "&type=all"
page = urllib2.urlopen(url) page = urllib2.urlopen(url)
soup = BeautifulSoup(page.read()) soup = BeautifulSoup(page.read())
if (soup.find('div') == None):
sys.exit("ultima pagina")
# apanha todos os items da pagina # apanha todos os items da pagina
items = soup.findAll('div',{'class': 'Elemento'}) items = soup.findAll('div',{'class': 'lazy'})
for item in items: for item in items:
if exists >= 5: if exists >= 5:
@ -82,23 +79,26 @@ for c in range(1,int(totalpages)):
# url # url
link = item.find('a') link = item.find('a')
# data # data
dt = item.find('b').contents[0].strip() dt = item.find('span',{'class': 'small'}).contents[0].strip()
dt = dt.replace(' ', '_') dt = dt.replace(' ', '_')
dt = dt.replace(',', '')
# mudar para AAAA_MM_DD # mudar para AAAA_MM_DD
match = re.search(r"(\d+)_(\w+)_(\d+)", dt) match = re.search(r"(\d+)_(\w+)_(\d+)", dt)
if match: if match:
dt = match.group(3) + "_" + months[match.group(2)] + "_" + match.group(1) dt = match.group(3) + "_" + months[match.group(2)] + "_" + match.group(1)
# parte ? # parte ?
pt = item.find('p').contents[0].strip() pts = item.findAll('b',{'class': 'text-dark-gray'})
pt = pts[1].contents[0]
pt = pt.replace('...', '').strip()
pt = pt.replace(' ', '_') pt = pt.replace(' ', '_')
print "-- " + dt, pt print "-- " + dt, pt
title = removeDisallowedFilenameChars(dt + "-" + pt) title = removeDisallowedFilenameChars(dt + "-" + pt)
if parseRTMP(link['href'],title,id) == False: if parseRTMP(link['href'],title,id) == False:
exists = exists + 1 exists = exists + 1
c = c + 1