first commit
This commit is contained in:
commit
fa9e2f3ba6
|
@ -0,0 +1,23 @@
|
||||||
|
Copyright (c) 2014, Pedro de Oliveira
|
||||||
|
All rights reserved.
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are met:
|
||||||
|
|
||||||
|
* Redistributions of source code must retain the above copyright notice, this
|
||||||
|
list of conditions and the following disclaimer.
|
||||||
|
|
||||||
|
* Redistributions in binary form must reproduce the above copyright notice,
|
||||||
|
this list of conditions and the following disclaimer in the documentation
|
||||||
|
and/or other materials provided with the distribution.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||||
|
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||||
|
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||||
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||||
|
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
@ -0,0 +1,26 @@
|
||||||
|
sicradical
|
||||||
|
==========
|
||||||
|
|
||||||
|
SIC Radical program videos downloader
|
||||||
|
|
||||||
|
Requirements:
|
||||||
|
- Python
|
||||||
|
- Beautiful Soup - http://www.crummy.com/software/BeautifulSoup/
|
||||||
|
- wget
|
||||||
|
|
||||||
|
Features:
|
||||||
|
- Downloads the full video archive
|
||||||
|
- Can be run in cron to download the latest ones
|
||||||
|
- After 5 "already downloaded" files it exits
|
||||||
|
- Uses a different directory per progName
|
||||||
|
|
||||||
|
Instructions:
|
||||||
|
- Extract the "bs4" directory from the Beautiful Soup package to the directory of the script.
|
||||||
|
- Run the script with an argument that is the progName to download
|
||||||
|
|
||||||
|
Example:
|
||||||
|
- The URL for the "VERY TYPICAL" program is http://sicradical.sapo.pt/programas/very-typical so our progName is very-typical.
|
||||||
|
- To download it just do:
|
||||||
|
```
|
||||||
|
./sicradical.py very-typical
|
||||||
|
```
|
|
@ -0,0 +1,77 @@
|
||||||
|
#!/usr/bin/env python
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# vim: tabstop=8 expandtab shiftwidth=4 softtabstop=4
|
||||||
|
|
||||||
|
import urllib2
|
||||||
|
import re
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
# Directory containing this script; per-program download directories
# are created beneath it.
scriptpath = os.path.dirname(os.path.realpath(__file__))

# Desktop-Chrome User-Agent sent with every request so the site serves
# the regular pages.  The original two-part literal was missing the
# space after "AppleWebKit/537.36", yielding a malformed UA string
# ("...537.36(KHTML, like Gecko)...").
agent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 ' \
        '(KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
|
||||||
|
|
||||||
|
def parsePage(url, progName):
|
||||||
|
match = re.search(r"/videos/([\w+|-]+)", url)
|
||||||
|
title = match.group(1)
|
||||||
|
|
||||||
|
programpath = scriptpath + "/" + progName
|
||||||
|
if os.path.isdir(programpath) is False:
|
||||||
|
os.makedirs(programpath)
|
||||||
|
destfn = programpath + "/" + title + '.mp4'
|
||||||
|
|
||||||
|
if os.path.isfile(destfn):
|
||||||
|
print "- Ja downloadada... a ignorar"
|
||||||
|
return False
|
||||||
|
|
||||||
|
headers = { 'User-Agent' : agent }
|
||||||
|
req = urllib2.Request(url, None, headers)
|
||||||
|
html = urllib2.urlopen(req).read()
|
||||||
|
|
||||||
|
soup = BeautifulSoup(html, "html.parser")
|
||||||
|
|
||||||
|
videourl = soup.find('source')['src']
|
||||||
|
if videourl:
|
||||||
|
print "- A sacar: " + title
|
||||||
|
cmd = 'wget "' + videourl + '" -O "' + destfn + '"'
|
||||||
|
os.system(cmd + "> /dev/null 2>&1")
|
||||||
|
print "- Done"
|
||||||
|
return True
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Usage: ./sicradical.py <progName>
    # The original only checked argc, so an empty-string argument
    # skipped the progName assignment and caused a NameError below.
    if len(sys.argv) != 2 or not sys.argv[1]:
        sys.exit("Correr com "+sys.argv[0]+" [progName]")
    progName = sys.argv[1]

    exists = 0   # how many "already downloaded" hits so far
    offset = ""  # pagination cursor: datetime of the last item seen

    while True:
        # The API pages by timestamp offset; empty offset = first page.
        url = ("http://sicradical.sapo.pt/api/molecule/category/programas/"
               + progName + "/videos?offset=" + offset)

        headers = {'User-Agent': agent}
        req = urllib2.Request(url, None, headers)
        html = urllib2.urlopen(req).read()

        soup = BeautifulSoup(html, "html.parser")

        # No <article> items at all means we ran past the last page.
        if soup.find('article') is None:
            sys.exit("ultima pagina")

        # apanha todos os items da pagina (grab every item on the page)
        items = soup.findAll('article')

        for item in items:
            # After 5 already-downloaded files, assume the archive is
            # complete (cron-friendly incremental mode).
            if exists >= 5:
                sys.exit("A sair apos 5 falhas, ja devo ter tudo...")

            # video page URL
            link = item.find('a')['href']
            # item timestamp doubles as the next-page offset
            dt = item.find('p', {'class': 'timeStamp'})['datetime']
            offset = dt

            if parsePage(link, progName) is False:
                exists = exists + 1
|
Loading…
Reference in New Issue