diff --git a/rtp.py b/rtp.py old mode 100755 new mode 100644 index 1fc6749..0b29c4e --- a/rtp.py +++ b/rtp.py @@ -2,7 +2,6 @@ # -*- coding: utf-8 -*- # vim: tabstop=8 expandtab shiftwidth=4 softtabstop=4 -from bs4 import BeautifulSoup import urllib2 import re import unicodedata @@ -10,21 +9,22 @@ import os import string import sys import time +from bs4 import BeautifulSoup months = { - 'Jan': '01', - 'Fev': '02', - 'Mar': '03', - 'Abr': '04', - 'Mai': '05', - 'Jun': '06', - 'Jul': '07', - 'Ago': '08', - 'Set': '09', - 'Out': '10', - 'Nov': '11', - 'Dez': '12' - } + 'Jan': '01', + 'Fev': '02', + 'Mar': '03', + 'Abr': '04', + 'Mai': '05', + 'Jun': '06', + 'Jul': '07', + 'Ago': '08', + 'Set': '09', + 'Out': '10', + 'Nov': '11', + 'Dez': '12' +} scriptpath = os.path.dirname(os.path.realpath(__file__)) validFilenameChars = "-_. %s%s" % (string.ascii_letters, string.digits) @@ -33,19 +33,19 @@ def removeDisallowedFilenameChars(filename): cleanedFilename = unicodedata.normalize('NFKD', filename).encode('ASCII', 'ignore') return ''.join(c for c in cleanedFilename if c in validFilenameChars) -def parseRTMP(url,title,progId): +def parseRTMP(url, title, progId): url = 'http://www.rtp.pt' + url match = re.search(r"play/p\d+/(e\d+)/", url) episode_id = match.group(1) programpath = scriptpath+"/"+progId - if os.path.isdir(programpath) == False: + if os.path.isdir(programpath) is False: os.makedirs(programpath) destfn = programpath + "/" + title + "_" + episode_id + '.mp3' page = urllib2.urlopen(url) - match = re.search('"hls_url": "(.+?)",', page.read()) + match = re.search('file: "(.+?)",', page.read()) if match: if os.path.isfile(destfn): print "- Ja downloadada... a ignorar" @@ -56,60 +56,61 @@ def parseRTMP(url,title,progId): print "- Done" return True -if len(sys.argv) != 2: - sys.exit("Correr com "+sys.argv[0]+" [progId]") +if __name__ == "__main__": + if len(sys.argv) != 2: + sys.exit("Correr com "+sys.argv[0]+" [progId]") -if sys.argv[1].isdigit(): - id = sys.argv[1] -else: - sys.exit("progId tem de ser um numero") + if sys.argv[1].isdigit(): + progId = sys.argv[1] + else: + sys.exit("progId tem de ser um numero") -exists = 0 -c = 1 -while True: - print "--- Pagina " + str(c) - url = "http://www.rtp.pt/play/bg_l_ep/?stamp=" + str(int(time.time())) + "&listDate=&listQuery=&listProgram=" + str(id) + "&listcategory=&listchannel=&listtype=recent&page=" + str(c) + "&type=all" + exists = 0 + c = 1 + while True: + print "--- Pagina " + str(c) + url = "http://www.rtp.pt/play/bg_l_ep/?stamp=" + str(int(time.time())) + "&listDate=&listQuery=&listProgram=" + str(progId) + "&listcategory=&listchannel=&listtype=recent&page=" + str(c) + "&type=all" - page = urllib2.urlopen(url) - soup = BeautifulSoup(page.read(), "html.parser") + page = urllib2.urlopen(url) + soup = BeautifulSoup(page.read(), "html.parser") - if (soup.find('div') == None): - sys.exit("ultima pagina") + if soup.find('div') is None: + sys.exit("ultima pagina") - # apanha todos os items da pagina - items = soup.findAll('div',{'class': 'lazy'}) + # apanha todos os items da pagina + items = soup.findAll('div', {'class': 'lazy'}) - for item in items: - if exists >= 5: - sys.exit("A sair apos 5 falhas, ja devo ter tudo...") + for item in items: + if exists >= 5: + sys.exit("A sair apos 5 falhas, ja devo ter tudo...") - # url - link = item.find('a') - # data - dt = item.find('span',{'class': 'small'}).contents[0].strip() - dt = dt.replace(' ', '_') - dt = dt.replace(',', '') + # url + link = item.find('a') + # data + dt = item.find('span', {'class': 'small'}).contents[0].strip() + dt = dt.replace(' ', '_') + dt = dt.replace(',', '') - # mudar para AAAA_MM_DD - match = re.search(r"(\d+)_(\w+)_(\d+)", dt) - if match: - dt = match.group(3) + "_" + months[match.group(2)] + "_" + match.group(1) + # mudar para AAAA_MM_DD + match = re.search(r"(\d+)_(\w+)_(\d+)", dt) + if match: + dt = match.group(3) + "_" + months[match.group(2)] + "_" + match.group(1) - # parte ? - pts = item.findAll('b',{'class': 'text-dark-gray'}) - try: - pt = pts[1].contents[0] - pt = pt.replace('...', '').strip() - pt = pt.replace(' ', '_') - pt = pt.replace('\n','') - except IndexError: - pt = "" + # parte ? + pts = item.findAll('b', {'class': 'text-dark-gray'}) + try: + pt = pts[1].contents[0] + pt = pt.replace('...', '').strip() + pt = pt.replace(' ', '_') + pt = pt.replace('\n', '') + except IndexError: + pt = "" - print "-- " + dt, pt + print "-- " + dt, pt - title = removeDisallowedFilenameChars(dt + "-" + pt) - if parseRTMP(link['href'],title,id) == False: - exists = exists + 1 + title = removeDisallowedFilenameChars(dt + "-" + pt) - c = c + 1 + if parseRTMP(link['href'], title, progId) is False: + exists = exists + 1 + c = c + 1