#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# vim: tabstop=8 expandtab shiftwidth=4 softtabstop=4
"""Scrape and download every video of a SIC Radical programme.

Usage: script.py [progName]

Walks the programme's paginated video listing and downloads each video
(via wget) into ``<script dir>/<progName>/<title>.mp4``, skipping files
that already exist.  Stops after the last page or after five
already-downloaded videos in a row.
"""
import os
import re
import subprocess
import sys
import urllib.request

from bs4 import BeautifulSoup

# Directory this script lives in; downloads are stored beneath it.
scriptpath = os.path.dirname(os.path.realpath(__file__))

# Browser-like User-Agent so the site serves the normal HTML pages.
agent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36' \
        '(KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'


def parsePage(url, progName):
    """Download the video behind *url* into the *progName* folder.

    Parameters
    ----------
    url : str
        Video page URL; the slug after ``/videos/`` is used as the title.
    progName : str
        Programme name, used as the destination sub-directory.

    Returns
    -------
    bool or None
        ``False`` when the file already exists (or the page is unusable),
        ``True`` after a successful download request, ``None`` when no
        video source was found on the page.
    """
    match = re.search(r"/videos/([\w+|-]+)", url)
    if match is None:
        # Unexpected URL shape: treat it like an unusable page instead of
        # crashing with AttributeError on match.group().
        return False
    title = match.group(1)

    programpath = os.path.join(scriptpath, progName)
    if not os.path.isdir(programpath):
        os.makedirs(programpath)

    destfn = os.path.join(programpath, title + '.mp4')
    if os.path.isfile(destfn):
        print("- Ja downloadada... a ignorar")
        return False

    headers = {'User-Agent': agent}
    req = urllib.request.Request(url, None, headers)
    html = urllib.request.urlopen(req).read()
    soup = BeautifulSoup(html, "html.parser")

    # Guard against pages with no <source> tag (soup.find returns None,
    # which the original code would index and crash on).
    source = soup.find('source')
    videourl = source.get('src') if source is not None else None
    if videourl:
        print("- A sacar: " + title)
        # List-form argv: no shell involved, so a hostile scraped URL or
        # title cannot inject shell commands (the old os.system string could).
        subprocess.run(
            ['wget', videourl, '-O', destfn],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
        print("- Done")
        return True


if __name__ == "__main__":
    if len(sys.argv) != 2:
        sys.exit("Correr com " + sys.argv[0] + " [progName]")
    progName = sys.argv[1]

    exists = 0   # consecutive already-downloaded videos seen
    offset = ""  # pagination cursor: timestamp of the last item processed
    while True:
        url = ("http://sicradical.sapo.pt/api/molecule/category/programas/"
               + progName + "/videos?offset=" + offset)
        headers = {'User-Agent': agent}
        req = urllib.request.Request(url, None, headers)
        html = urllib.request.urlopen(req).read()
        soup = BeautifulSoup(html, "html.parser")

        # No <article> items means we paged past the end of the listing.
        if soup.find('article') is None:
            sys.exit("ultima pagina")

        # Process every item on this page.
        for item in soup.findAll('article'):
            if exists >= 5:
                sys.exit("A sair apos 5 falhas, ja devo ter tudo...")
            # Link to the video page.
            link = item.find('a')['href']
            # Timestamp doubles as the next pagination offset.
            dt = item.find('p', {'class': 'timeStamp'})['datetime']
            offset = dt
            if parsePage(link, progName) is False:
                exists = exists + 1