commit fa9e2f3ba6d18843f613e22364f6576ba04fc0f2
Author: Pedro de Oliveira
Date:   Tue Aug 7 03:39:34 2018 +0100

    first commit

diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..1d6cd50
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,23 @@
+Copyright (c) 2014, Pedro de Oliveira
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation
+  and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..d31aeb8
--- /dev/null
+++ b/README.md
@@ -0,0 +1,26 @@
+sicradical
+==========
+
+SIC Radical program videos downloader
+
+Requirements:
+- Python
+- Beautiful Soup - http://www.crummy.com/software/BeautifulSoup/
+- wget
+
+Features:
+- Downloads the full video archive of a program
+- Can be run from cron to pick up the latest episodes (see the example below)
+- Exits after finding 5 already-downloaded files
+- Uses a separate directory per progName
+
+Instructions:
+- Extract the "bs4" directory from the Beautiful Soup package into the directory of the script.
+- Run the script with the progName to download as its only argument.
+
+Example:
+- The URL for the "VERY TYPICAL" program is http://sicradical.sapo.pt/programas/very-typical, so its progName is very-typical.
+- To download it, run:
+```
+./sicradical.py very-typical
+```
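
The README's cron feature amounts to scheduling the same command on a fixed interval. A minimal sketch of such an entry, assuming a hypothetical checkout at /opt/sicradical (the path and schedule are illustrative, not part of this commit):

```
# Hypothetical crontab entry: fetch new "very-typical" episodes at 04:00 daily
0 4 * * * cd /opt/sicradical && ./sicradical.py very-typical
```

Because the script exits on its own after 5 already-downloaded files, an unattended run like this stays cheap once the archive is complete.
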
a ignorar" + return False + + headers = { 'User-Agent' : agent } + req = urllib2.Request(url, None, headers) + html = urllib2.urlopen(req).read() + + soup = BeautifulSoup(html, "html.parser") + + videourl = soup.find('source')['src'] + if videourl: + print "- A sacar: " + title + cmd = 'wget "' + videourl + '" -O "' + destfn + '"' + os.system(cmd + "> /dev/null 2>&1") + print "- Done" + return True + +if __name__ == "__main__": + if len(sys.argv) != 2: + sys.exit("Correr com "+sys.argv[0]+" [progName]") + + if sys.argv[1]: + progName = sys.argv[1] + + exists = 0 + offset = "" + while True: + url = "http://sicradical.sapo.pt/api/molecule/category/programas/" + progName + "/videos?offset=" + offset + + headers = { 'User-Agent' : agent } + req = urllib2.Request(url, None, headers) + html = urllib2.urlopen(req).read() + + soup = BeautifulSoup(html, "html.parser") + + if soup.find('article') is None: + sys.exit("ultima pagina") + + # apanha todos os items da pagina + items = soup.findAll('article') + + for item in items: + if exists >= 5: + sys.exit("A sair apos 5 falhas, ja devo ter tudo...") + + # url + link = item.find('a')['href'] + # data + dt = item.find('p', {'class': 'timeStamp'})['datetime'] + offset = dt + + if parsePage(link, progName) is False: + exists = exists + 1