Pedro de Oliveira 2021-05-20 01:21:20 +01:00
parent ec83c9b7ed
commit b780f48282
Signed by: falso
GPG Key ID: 1E4F05ACDBB2C85C
2 changed files with 73 additions and 95 deletions


@@ -5,6 +5,7 @@ RTP audio podcasts downloader
 Requirements:
 - Python
+- Requests - https://docs.python-requests.org/en/master/
 - Beautiful Soup - http://www.crummy.com/software/BeautifulSoup/
 - wget
@@ -15,7 +16,6 @@ Features:
 - Uses a different directory per progId
 Instructions:
-- Extract the "bs4" directory from the Beautiful Soup package to the directory of the script.
 - Run the script with an argument that is the progId to download
 Example:
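For reference, the new dependencies can be installed with pip (a sketch; the README itself does not prescribe an install method):

    pip install requests beautifulsoup4

and a hypothetical invocation, assuming progId is the numeric part of a programme's /play/p<id>/ URL on rtp.pt (the value below is made up):

    ./rtp.py 9876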

rtp.py

@@ -1,116 +1,94 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
-# vim: tabstop=8 expandtab shiftwidth=4 softtabstop=4
-import urllib2
 import re
-import unicodedata
 import os
-import string
 import sys
-import time
+import string
+import requests
 from bs4 import BeautifulSoup
 
-months = {
-    'Jan': '01',
-    'Fev': '02',
-    'Mar': '03',
-    'Abr': '04',
-    'Mai': '05',
-    'Jun': '06',
-    'Jul': '07',
-    'Ago': '08',
-    'Set': '09',
-    'Out': '10',
-    'Nov': '11',
-    'Dez': '12'
-}
-
-scriptpath = os.path.dirname(os.path.realpath(__file__))
-validFilenameChars = "-_. %s%s" % (string.ascii_letters, string.digits)
-
-
-def removeDisallowedFilenameChars(filename):
-    cleanedFilename = unicodedata.normalize('NFKD', filename).encode('ASCII', 'ignore')
-    return ''.join(c for c in cleanedFilename if c in validFilenameChars)
-
-
-def parseRTMP(url, title, progId):
-    url = 'http://www.rtp.pt' + url
-    match = re.search(r"play/p\d+/(e\d+)/", url)
-    episode_id = match.group(1)
-
-    programpath = scriptpath+"/"+progId
-    if os.path.isdir(programpath) is False:
-        os.makedirs(programpath)
-    destfn = programpath + "/" + title + "_" + episode_id + '.mp3'
-    page = urllib2.urlopen(url)
-
-    match = re.search('file: "(.+?)",', page.read())
-    if match:
-        if os.path.isfile(destfn):
-            print "- Ja downloadada... a ignorar"
-            return False
-        print "- A sacar..."
-        cmd = 'wget "' + match.group(1) + '" -O "' + destfn + '"'
-        os.system(cmd + "> /dev/null 2>&1")
-        print "- Done"
-    return True
+
+def fix_filename(filename):
+    filename = filename.replace(' ', '_')
+    safechars = bytearray(('_-.()' + string.digits + string.ascii_letters).encode())
+    allchars = bytearray(range(0x100))
+    deletechars = bytearray(set(allchars) - set(safechars))
+    return filename.encode('ascii', 'ignore').translate(None, deletechars).decode()
+
+
+def parse_episodes(progId):
+    page = 1
+    while True:
+        url = "https://www.rtp.pt/play/bg_l_ep/?listProgram={}&page={}".format(progId, page)
+        print("Scraping Page {} ({})".format(page, url))
+        response = requests.get(
+            url,
+            headers={
+                'User-agent': 'Mozilla/5.0',
+                'Cookie': 'rtp_cookie_parental=0; rtp_privacy=666; rtp_cookie_privacy=permit 1,2,3,4; googlepersonalization=1; _recid='
+            }
+        )
+        soup = BeautifulSoup(response.content, "html.parser")
+        if soup.find('article') is None:
+            sys.exit("No more pages.")
+        for article in soup.find_all('article'):
+            url = article.find('a')['href']
+            episode_date = article.find('span', {'class': 'episode-date'})
+            episode_title = article.find('h4', {'class': 'episode-title'})
+            yield {
+                'url': "https://rtp.pt{}".format(url),
+                'filename': fix_filename(
+                    "{}-{}.mp3".format(
+                        episode_date.text.strip(),
+                        episode_title.text.strip() if episode_title else ''
+                    )
+                )
+            }
+        page += 1
+
+
+def download_episode(episode, local_file):
+    response = requests.get(
+        episode['url'],
+        headers={
+            'User-agent': 'Mozilla/5.0',
+            'Cookie': 'rtp_cookie_parental=0; rtp_privacy=666; rtp_cookie_privacy=permit 1,2,3,4; googlepersonalization=1; _recid='
+        }
+    )
+    file_url = re.search(r"f = \"(.*?)\"", response.text)
+    if file_url:
+        cmd = "wget \"{}\" -O \"{}\" > /dev/null 2>&1".format(
+            file_url.group(1),
+            local_file
+        )
+        print("Downloading {} ...".format(local_file))
+        os.system(cmd)
+        print("Done.")
 
 
 if __name__ == "__main__":
     if len(sys.argv) != 2:
-        sys.exit("Correr com "+sys.argv[0]+" [progId]")
+        sys.exit("Run with {} [progId]".format(sys.argv[0]))
     if sys.argv[1].isdigit():
         progId = sys.argv[1]
     else:
-        sys.exit("progId tem de ser um numero")
-
-    exists = 0
-    c = 1
-    while True:
-        print "--- Pagina " + str(c)
-        url = "http://www.rtp.pt/play/bg_l_ep/?stamp=" + str(int(time.time())) + "&listDate=&listQuery=&listProgram=" + str(progId) + "&listcategory=&listchannel=&listtype=recent&page=" + str(c) + "&type=all"
-        page = urllib2.urlopen(url)
-        soup = BeautifulSoup(page.read(), "html.parser")
-
-        if soup.find('div') is None:
-            sys.exit("ultima pagina")
-
-        # apanha todos os items da pagina
-        items = soup.findAll('div', {'class': 'lazy'})
-
-        for item in items:
-            if exists >= 5:
-                sys.exit("A sair apos 5 falhas, ja devo ter tudo...")
-            # url
-            link = item.find('a')
-            # data
-            dt = item.find('span', {'class': 'small'}).contents[0].strip()
-            dt = dt.replace(' ', '_')
-            dt = dt.replace(',', '')
-            # mudar para AAAA_MM_DD
-            match = re.search(r"(\d+)_(\w+)_(\d+)", dt)
-            if match:
-                dt = match.group(3) + "_" + months[match.group(2)] + "_" + match.group(1)
-            # parte ?
-            pts = item.findAll('b', {'class': 'text-dark-gray'})
-            try:
-                pt = pts[1].contents[0]
-                pt = pt.replace('...', '').strip()
-                pt = pt.replace(' ', '_')
-                pt = pt.replace('\n', '')
-            except IndexError:
-                pt = ""
-            print "-- " + dt, pt
-            title = removeDisallowedFilenameChars(dt + "-" + pt)
-            if parseRTMP(link['href'], title, progId) is False:
-                exists = exists + 1
-        c = c + 1
+        sys.exit("progId must be numeric")
+
+    script_path = os.path.dirname(os.path.realpath(__file__))
+    directory = "{}/{}".format(script_path, progId)
+    if os.path.isdir(directory) is False:
+        os.makedirs(directory)
+
+    failed = 0
+    for episode in parse_episodes(progId):
+        if failed >= 5:
+            sys.exit("Already have 5 files...")
+        local_file = "{}/{}".format(
+            directory,
+            episode['filename']
+        )
+        if os.path.isfile(local_file):
+            failed += 1
+            continue
+        download_episode(episode, local_file)
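As a quick check of the refactored interface, a minimal sketch that lists a programme's episodes without downloading anything (assumes the new rtp.py sits next to this snippet so it can be imported; "9876" is a made-up progId):

    # list_episodes.py - hypothetical helper, not part of the commit.
    # parse_episodes() is a generator that calls sys.exit() once no more
    # <article> elements are found, so this loop ends the process itself.
    from rtp import parse_episodes

    for episode in parse_episodes("9876"):
        print("{} -> {}".format(episode['url'], episode['filename']))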