mirror of https://github.com/falsovsky/RTPapd.git
Rewrite
This commit is contained in:
parent ec83c9b7ed
commit b780f48282
@@ -5,6 +5,7 @@ RTP audio podcasts downloader
 
 Requirements:
 - Python
+- Requests - https://docs.python-requests.org/en/master/
 - Beautiful Soup - http://www.crummy.com/software/BeautifulSoup/
 - wget
 

@@ -15,7 +16,6 @@ Features:
 - Uses a different directory per progId
 
 Instructions:
-- Extract the "bs4" directory from the Beautiful Soup package to the directory of the script.
 - Run the script with an argument that is the progId to download
 
 Example:
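The README's own example line is outside the hunk shown above; for reference, the rewritten script keeps the same command-line interface and is run as

    python rtp.py [progId]

where [progId] is the numeric programme id. A non-numeric argument is rejected, and downloads go into a per-progId directory created next to the script.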
rtp.py (166 lines changed)
@@ -1,116 +1,94 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
-# vim: tabstop=8 expandtab shiftwidth=4 softtabstop=4
 
-import urllib2
 import re
-import unicodedata
 import os
-import string
 import sys
-import time
+import string
 
+import requests
 from bs4 import BeautifulSoup
 
-months = {
-    'Jan': '01',
-    'Fev': '02',
-    'Mar': '03',
-    'Abr': '04',
-    'Mai': '05',
-    'Jun': '06',
-    'Jul': '07',
-    'Ago': '08',
-    'Set': '09',
-    'Out': '10',
-    'Nov': '11',
-    'Dez': '12'
-}
 
-scriptpath = os.path.dirname(os.path.realpath(__file__))
-validFilenameChars = "-_. %s%s" % (string.ascii_letters, string.digits)
+def fix_filename(filename):
+    filename = filename.replace(' ', '_')
+    safechars = bytearray(('_-.()' + string.digits + string.ascii_letters).encode())
+    allchars = bytearray(range(0x100))
+    deletechars = bytearray(set(allchars) - set(safechars))
+    return filename.encode('ascii', 'ignore').translate(None, deletechars).decode()
 
 
-def removeDisallowedFilenameChars(filename):
-    cleanedFilename = unicodedata.normalize('NFKD', filename).encode('ASCII', 'ignore')
-    return ''.join(c for c in cleanedFilename if c in validFilenameChars)
+def parse_episodes(progId):
+    page = 1
+    while True:
+        url = "https://www.rtp.pt/play/bg_l_ep/?listProgram={}&page={}".format(progId, page)
+        print("Scraping Page {} ({})".format(page, url))
+        response = requests.get(
+            url,
+            headers={
+                'User-agent': 'Mozilla/5.0',
+                'Cookie': 'rtp_cookie_parental=0; rtp_privacy=666; rtp_cookie_privacy=permit 1,2,3,4; googlepersonalization=1; _recid='
+            }
+        )
+        soup = BeautifulSoup(response.content, "html.parser")
+
+        if soup.find('article') is None:
+            sys.exit("No more pages.")
+
+        for article in soup.find_all('article'):
+            url = article.find('a')['href']
+            episode_date = article.find('span', {'class': 'episode-date'})
+            episode_title = article.find('h4', {'class': 'episode-title'})
+            yield {
+                'url': "https://rtp.pt{}".format(url),
+                'filename': fix_filename(
+                    "{}-{}.mp3".format(
+                        episode_date.text.strip(),
+                        episode_title.text.strip() if episode_title else ''
+                    )
+                )
+            }
+        page += 1
 
 
-def parseRTMP(url, title, progId):
-    url = 'http://www.rtp.pt' + url
-
-    match = re.search(r"play/p\d+/(e\d+)/", url)
-    episode_id = match.group(1)
-
-    programpath = scriptpath+"/"+progId
-    if os.path.isdir(programpath) is False:
-        os.makedirs(programpath)
-    destfn = programpath + "/" + title + "_" + episode_id + '.mp3'
-    page = urllib2.urlopen(url)
-
-    match = re.search('file: "(.+?)",', page.read())
-    if match:
-        if os.path.isfile(destfn):
-            print "- Ja downloadada... a ignorar"
-            return False
-        print "- A sacar..."
-        cmd = 'wget "' + match.group(1) + '" -O "' + destfn + '"'
-        os.system(cmd + "> /dev/null 2>&1")
-        print "- Done"
-        return True
+def download_episode(episode, local_file):
+    response = requests.get(
+        episode['url'],
+        headers={
+            'User-agent': 'Mozilla/5.0',
+            'Cookie': 'rtp_cookie_parental=0; rtp_privacy=666; rtp_cookie_privacy=permit 1,2,3,4; googlepersonalization=1; _recid='
+        }
+    )
+    file_url = re.search(r"f = \"(.*?)\"", response.text)
+    if file_url:
+        cmd = "wget \"{}\" -O \"{}\" > /dev/null 2>&1".format(
+            file_url.group(1),
+            local_file
+        )
+        print("Downloading {} ...".format(local_file))
+        os.system(cmd)
+        print("Done.")
 
 
 if __name__ == "__main__":
     if len(sys.argv) != 2:
-        sys.exit("Correr com "+sys.argv[0]+" [progId]")
+        sys.exit("Run with {} [progId]".format(sys.argv[0]))
 
     if sys.argv[1].isdigit():
         progId = sys.argv[1]
     else:
-        sys.exit("progId tem de ser um numero")
+        sys.exit("progId must be numeric")
 
-    exists = 0
-    c = 1
-    while True:
-        print "--- Pagina " + str(c)
-        url = "http://www.rtp.pt/play/bg_l_ep/?stamp=" + str(int(time.time())) + "&listDate=&listQuery=&listProgram=" + str(progId) + "&listcategory=&listchannel=&listtype=recent&page=" + str(c) + "&type=all"
+    script_path = os.path.dirname(os.path.realpath(__file__))
+    directory = "{}/{}".format(script_path, progId)
+    if os.path.isdir(directory) is False:
+        os.makedirs(directory)
 
-        page = urllib2.urlopen(url)
-        soup = BeautifulSoup(page.read(), "html.parser")
-
-        if soup.find('div') is None:
-            sys.exit("ultima pagina")
-
-        # apanha todos os items da pagina
-        items = soup.findAll('div', {'class': 'lazy'})
-
-        for item in items:
-            if exists >= 5:
-                sys.exit("A sair apos 5 falhas, ja devo ter tudo...")
-
-            # url
-            link = item.find('a')
-            # data
-            dt = item.find('span', {'class': 'small'}).contents[0].strip()
-            dt = dt.replace(' ', '_')
-            dt = dt.replace(',', '')
-
-            # mudar para AAAA_MM_DD
-            match = re.search(r"(\d+)_(\w+)_(\d+)", dt)
-            if match:
-                dt = match.group(3) + "_" + months[match.group(2)] + "_" + match.group(1)
-
-            # parte ?
-            pts = item.findAll('b', {'class': 'text-dark-gray'})
-            try:
-                pt = pts[1].contents[0]
-                pt = pt.replace('...', '').strip()
-                pt = pt.replace(' ', '_')
-                pt = pt.replace('\n', '')
-            except IndexError:
-                pt = ""
-
-            print "-- " + dt, pt
-
-            title = removeDisallowedFilenameChars(dt + "-" + pt)
-
-            if parseRTMP(link['href'], title, progId) is False:
-                exists = exists + 1
-
-        c = c + 1
+    failed = 0
+    for episode in parse_episodes(progId):
+        if failed >= 5:
+            sys.exit("Already have 5 files...")
+        local_file = "{}/{}".format(
+            directory,
+            episode['filename']
+        )
+        if os.path.isfile(local_file):
+            failed += 1
+            continue
+        download_episode(episode, local_file)
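The new fix_filename() replaces spaces with underscores and then strips every byte that is not an ASCII letter, digit or one of "_-.()"; accented characters and punctuation are silently dropped from the resulting file name. A minimal, self-contained sketch of that behaviour, run on an invented episode title purely for illustration:

import string

def fix_filename(filename):
    # Same sanitisation as the rewritten rtp.py: spaces -> underscores,
    # then delete every byte outside a small ASCII whitelist.
    filename = filename.replace(' ', '_')
    safechars = bytearray(('_-.()' + string.digits + string.ascii_letters).encode())
    deletechars = bytearray(set(bytearray(range(0x100))) - set(safechars))
    return filename.encode('ascii', 'ignore').translate(None, deletechars).decode()

# Invented input, not taken from the repository:
print(fix_filename("12 Mai 2021 - Música Portuguesa, Parte 1.mp3"))
# prints: 12_Mai_2021_-_Msica_Portuguesa_Parte_1.mp3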