RTP changed the site again, fixed.

Also applied some better coding practices to the code.
This commit is contained in:
Pedro de Oliveira 2017-01-29 16:40:45 +00:00
parent 8b9d315dd0
commit ef48fac7df
1 changed file with 62 additions and 61 deletions

123
rtp.py Executable file → Normal file
View File

@ -2,7 +2,6 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# vim: tabstop=8 expandtab shiftwidth=4 softtabstop=4 # vim: tabstop=8 expandtab shiftwidth=4 softtabstop=4
from bs4 import BeautifulSoup
import urllib2 import urllib2
import re import re
import unicodedata import unicodedata
@ -10,21 +9,22 @@ import os
import string import string
import sys import sys
import time import time
from bs4 import BeautifulSoup
# Portuguese month abbreviations (as they appear on rtp.pt) mapped to
# zero-padded month numbers; used when rebuilding dates as AAAA_MM_DD.
months = {
    'Jan': '01',
    'Fev': '02',
    'Mar': '03',
    'Abr': '04',
    'Mai': '05',
    'Jun': '06',
    'Jul': '07',
    'Ago': '08',
    'Set': '09',
    'Out': '10',
    'Nov': '11',
    'Dez': '12'
}

# Directory containing this script; per-programme folders are created here.
scriptpath = os.path.dirname(os.path.realpath(__file__))

# Characters permitted in generated filenames (letters, digits, "-_. ").
validFilenameChars = "-_. %s%s" % (string.ascii_letters, string.digits)
@ -33,19 +33,19 @@ def removeDisallowedFilenameChars(filename):
cleanedFilename = unicodedata.normalize('NFKD', filename).encode('ASCII', 'ignore') cleanedFilename = unicodedata.normalize('NFKD', filename).encode('ASCII', 'ignore')
return ''.join(c for c in cleanedFilename if c in validFilenameChars) return ''.join(c for c in cleanedFilename if c in validFilenameChars)
def parseRTMP(url,title,progId): def parseRTMP(url, title, progId):
url = 'http://www.rtp.pt' + url url = 'http://www.rtp.pt' + url
match = re.search(r"play/p\d+/(e\d+)/", url) match = re.search(r"play/p\d+/(e\d+)/", url)
episode_id = match.group(1) episode_id = match.group(1)
programpath = scriptpath+"/"+progId programpath = scriptpath+"/"+progId
if os.path.isdir(programpath) == False: if os.path.isdir(programpath) is False:
os.makedirs(programpath) os.makedirs(programpath)
destfn = programpath + "/" + title + "_" + episode_id + '.mp3' destfn = programpath + "/" + title + "_" + episode_id + '.mp3'
page = urllib2.urlopen(url) page = urllib2.urlopen(url)
match = re.search('"hls_url": "(.+?)",', page.read()) match = re.search('file: "(.+?)",', page.read())
if match: if match:
if os.path.isfile(destfn): if os.path.isfile(destfn):
print "- Ja downloadada... a ignorar" print "- Ja downloadada... a ignorar"
@ -56,60 +56,61 @@ def parseRTMP(url,title,progId):
print "- Done" print "- Done"
return True return True
if len(sys.argv) != 2: if __name__ == "__main__":
sys.exit("Correr com "+sys.argv[0]+" [progId]") if len(sys.argv) != 2:
sys.exit("Correr com "+sys.argv[0]+" [progId]")
if sys.argv[1].isdigit(): if sys.argv[1].isdigit():
id = sys.argv[1] progId = sys.argv[1]
else: else:
sys.exit("progId tem de ser um numero") sys.exit("progId tem de ser um numero")
exists = 0 exists = 0
c = 1 c = 1
while True: while True:
print "--- Pagina " + str(c) print "--- Pagina " + str(c)
url = "http://www.rtp.pt/play/bg_l_ep/?stamp=" + str(int(time.time())) + "&listDate=&listQuery=&listProgram=" + str(id) + "&listcategory=&listchannel=&listtype=recent&page=" + str(c) + "&type=all" url = "http://www.rtp.pt/play/bg_l_ep/?stamp=" + str(int(time.time())) + "&listDate=&listQuery=&listProgram=" + str(progId) + "&listcategory=&listchannel=&listtype=recent&page=" + str(c) + "&type=all"
page = urllib2.urlopen(url) page = urllib2.urlopen(url)
soup = BeautifulSoup(page.read(), "html.parser") soup = BeautifulSoup(page.read(), "html.parser")
if (soup.find('div') == None): if soup.find('div') is None:
sys.exit("ultima pagina") sys.exit("ultima pagina")
# apanha todos os items da pagina # apanha todos os items da pagina
items = soup.findAll('div',{'class': 'lazy'}) items = soup.findAll('div', {'class': 'lazy'})
for item in items: for item in items:
if exists >= 5: if exists >= 5:
sys.exit("A sair apos 5 falhas, ja devo ter tudo...") sys.exit("A sair apos 5 falhas, ja devo ter tudo...")
# url # url
link = item.find('a') link = item.find('a')
# data # data
dt = item.find('span',{'class': 'small'}).contents[0].strip() dt = item.find('span', {'class': 'small'}).contents[0].strip()
dt = dt.replace(' ', '_') dt = dt.replace(' ', '_')
dt = dt.replace(',', '') dt = dt.replace(',', '')
# mudar para AAAA_MM_DD # mudar para AAAA_MM_DD
match = re.search(r"(\d+)_(\w+)_(\d+)", dt) match = re.search(r"(\d+)_(\w+)_(\d+)", dt)
if match: if match:
dt = match.group(3) + "_" + months[match.group(2)] + "_" + match.group(1) dt = match.group(3) + "_" + months[match.group(2)] + "_" + match.group(1)
# parte ? # parte ?
pts = item.findAll('b',{'class': 'text-dark-gray'}) pts = item.findAll('b', {'class': 'text-dark-gray'})
try: try:
pt = pts[1].contents[0] pt = pts[1].contents[0]
pt = pt.replace('...', '').strip() pt = pt.replace('...', '').strip()
pt = pt.replace(' ', '_') pt = pt.replace(' ', '_')
pt = pt.replace('\n','') pt = pt.replace('\n', '')
except IndexError: except IndexError:
pt = "" pt = ""
print "-- " + dt, pt print "-- " + dt, pt
title = removeDisallowedFilenameChars(dt + "-" + pt) title = removeDisallowedFilenameChars(dt + "-" + pt)
if parseRTMP(link['href'],title,id) == False:
exists = exists + 1
c = c + 1 if parseRTMP(link['href'], title, progId) is False:
exists = exists + 1
c = c + 1