first commit

2018-08-07 03:39:34 +01:00 · 2018-08-07 03:39:34 +01:00 · fa9e2f3ba6
commit fa9e2f3ba6
3 changed files with 126 additions and 0 deletions
--- a/23
+++ b/23
@ -0,0 +1,23 @@
+Copyright (c) 2014, Pedro de Oliveira
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation
+  and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--- a/README.md
+++ b/README.md
@ -0,0 +1,26 @@
+sicradical
+==========
+
+SIC Radical program videos downloader
+
+Requirements:
+- Python 2 (the script imports `urllib2`, which is not available in Python 3)
+- Beautiful Soup - http://www.crummy.com/software/BeautifulSoup/
+- wget
+
+Features:
+- Downloads the full video archive
+- Can be run in cron to download the latest ones
+- After 5 "already downloaded" files it exits
+- Uses a different directory per progName
+
+Instructions:
+- Extract the "bs4" directory from the Beautiful Soup package to the directory of the script.
+- Run the script with a single argument: the progName of the program to download
+
+Example:
+- The URL for the "VERY TYPICAL" program is http://sicradical.sapo.pt/programas/very-typical so our progName is very-typical.
+- To download it just do:
+```
+./sicradical.py very-typical
+```
--- a/sicradical.py
+++ b/sicradical.py
@ -0,0 +1,77 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# vim: tabstop=8 expandtab shiftwidth=4 softtabstop=4
+
+import urllib2
+import re
+import os
+import sys
+from bs4 import BeautifulSoup
+
+scriptpath = os.path.dirname(os.path.realpath(__file__))
+agent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36' \
+    '(KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
+
+def parsePage(url, progName):
+    match = re.search(r"/videos/([\w+|-]+)", url)
+    title = match.group(1)
+
+    programpath = scriptpath + "/" + progName
+    if os.path.isdir(programpath) is False:
+        os.makedirs(programpath)
+    destfn = programpath + "/" + title + '.mp4'
+
+    if os.path.isfile(destfn):
+        print "- Ja downloadada... a ignorar"
+        return False
+
+    headers = { 'User-Agent' : agent }
+    req = urllib2.Request(url, None, headers)
+    html = urllib2.urlopen(req).read()
+
+    soup = BeautifulSoup(html, "html.parser")
+
+    videourl = soup.find('source')['src']
+    if videourl:
+        print "- A sacar: " + title
+        cmd = 'wget "' + videourl + '" -O "' + destfn + '"'
+        os.system(cmd + "> /dev/null 2>&1")
+        print "- Done"
+        return True
+
+if __name__ == "__main__":
+    if len(sys.argv) != 2:
+        sys.exit("Correr com "+sys.argv[0]+" [progName]")
+
+    if sys.argv[1]:
+        progName = sys.argv[1]
+
+    exists = 0
+    offset = ""
+    while True:
+        url = "http://sicradical.sapo.pt/api/molecule/category/programas/" + progName + "/videos?offset=" + offset
+
+        headers = { 'User-Agent' : agent }
+        req = urllib2.Request(url, None, headers)
+        html = urllib2.urlopen(req).read()
+
+        soup = BeautifulSoup(html, "html.parser")
+
+        if soup.find('article') is None:
+            sys.exit("ultima pagina")
+
+        # apanha todos os items da pagina
+        items = soup.findAll('article')
+
+        for item in items:
+            if exists >= 5:
+                sys.exit("A sair apos 5 falhas, ja devo ter tudo...")
+
+            # url
+            link = item.find('a')['href']
+            # data
+            dt = item.find('p', {'class': 'timeStamp'})['datetime']
+            offset = dt
+
+            if parsePage(link, progName) is False:
+                exists = exists + 1