mirror of https://github.com/falsovsky/RTPapd.git
95 lines
3.0 KiB
Python
Executable File
95 lines
3.0 KiB
Python
Executable File
#!/usr/bin/env python
|
|
# -*- coding: utf-8 -*-
|
|
|
|
import re
|
|
import os
|
|
import sys
|
|
import string
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
|
|
|
|
def fix_filename(filename):
|
|
filename = filename.replace(' ', '_')
|
|
safechars = bytearray(('_-.()' + string.digits + string.ascii_letters).encode())
|
|
allchars = bytearray(range(0x100))
|
|
deletechars = bytearray(set(allchars) - set(safechars))
|
|
return filename.encode('ascii', 'ignore').translate(None, deletechars).decode()
|
|
|
|
def parse_episodes(progId):
|
|
page = 1
|
|
while True:
|
|
url = "https://www.rtp.pt/play/bg_l_ep/?listProgram={}&page={}".format(progId, page)
|
|
print("Scraping Page {} ({})".format(page, url))
|
|
response = requests.get(
|
|
url,
|
|
headers={
|
|
'User-agent': 'Mozilla/5.0',
|
|
'Cookie': 'rtp_cookie_parental=0; rtp_privacy=666; rtp_cookie_privacy=permit 1,2,3,4; googlepersonalization=1; _recid='
|
|
}
|
|
)
|
|
soup = BeautifulSoup(response.content, "html.parser")
|
|
|
|
if soup.find('article') is None:
|
|
sys.exit("No more pages.")
|
|
|
|
for article in soup.find_all('article'):
|
|
url = article.find('a')['href']
|
|
episode_date = article.find('span', {'class': 'episode-date'})
|
|
episode_title = article.find('h4', {'class': 'episode-title'})
|
|
yield {
|
|
'url': "https://rtp.pt{}".format(url),
|
|
'filename': fix_filename(
|
|
"{}-{}.mp3".format(
|
|
episode_date.text.strip(),
|
|
episode_title.text.strip() if episode_title else ''
|
|
)
|
|
)
|
|
}
|
|
page += 1
|
|
|
|
def download_episode(episode, local_file):
|
|
response = requests.get(
|
|
episode['url'],
|
|
headers={
|
|
'User-agent': 'Mozilla/5.0',
|
|
'Cookie': 'rtp_cookie_parental=0; rtp_privacy=666; rtp_cookie_privacy=permit 1,2,3,4; googlepersonalization=1; _recid='
|
|
}
|
|
)
|
|
file_url = re.search(r"f = \"(.*?)\"", response.text)
|
|
if file_url:
|
|
cmd = "wget \"{}\" -O \"{}\" > /dev/null 2>&1".format(
|
|
file_url.group(1),
|
|
local_file
|
|
)
|
|
print("Downloading {} ...".format(local_file))
|
|
os.system(cmd)
|
|
print("Done.")
|
|
|
|
if __name__ == "__main__":
|
|
if len(sys.argv) != 2:
|
|
sys.exit("Run with {} [progId]".format(sys.argv[0]))
|
|
|
|
if sys.argv[1].isdigit():
|
|
progId = sys.argv[1]
|
|
else:
|
|
sys.exit("progId must be numeric")
|
|
|
|
script_path = os.path.dirname(os.path.realpath(__file__))
|
|
directory = "{}/{}".format(script_path, progId)
|
|
if os.path.isdir(directory) is False:
|
|
os.makedirs(directory)
|
|
|
|
failed = 0
|
|
for episode in parse_episodes(progId):
|
|
if failed >= 5:
|
|
sys.exit("Already have 5 files...")
|
|
local_file = "{}/{}".format(
|
|
directory,
|
|
episode['filename']
|
|
)
|
|
if os.path.isfile(local_file):
|
|
failed += 1
|
|
continue
|
|
download_episode(episode, local_file)
|