sicradical/sicradical.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# vim: tabstop=8 expandtab shiftwidth=4 softtabstop=4

import os
import re
import subprocess
import sys
import urllib2

from bs4 import BeautifulSoup

# Directory containing this script; programme folders are created beneath it.
scriptpath = os.path.dirname(os.path.realpath(__file__))
# Browser-like User-Agent sent with the urllib2 page requests.  The two pieces
# are joined by implicit string concatenation, so the space that separates
# "AppleWebKit/537.36" from "(KHTML, like Gecko)" must be written explicitly.
agent = ('Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 '
         '(KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36')

def parsePage(url, progName):
    """Download the video embedded in an episode page unless already saved.

    The file name is the slug that follows ``/videos/`` in *url*; the video
    is stored as ``<scriptpath>/<progName>/<slug>.mp4``.

    Args:
        url: Address of the episode page; expected to contain
            ``/videos/<slug>``.
        progName: Programme name, used as the destination sub-directory.

    Returns:
        True when the video was downloaded, False when the destination file
        already existed, and None when nothing could be downloaded (no slug
        in the URL, no ``<source src=...>`` in the page, or wget failed).
    """
    match = re.search(r"/videos/([\w+|-]+)", url)
    if match is None:
        # Not an episode link: there is no slug to name the file after.
        return None
    title = match.group(1)

    programpath = scriptpath + "/" + progName
    if not os.path.isdir(programpath):
        os.makedirs(programpath)
    destfn = programpath + "/" + title + '.mp4'

    if os.path.isfile(destfn):
        print("- Ja downloadada... a ignorar")
        return False

    req = urllib2.Request(url, None, {'User-Agent': agent})
    response = urllib2.urlopen(req)
    try:
        html = response.read()
    finally:
        response.close()

    soup = BeautifulSoup(html, "html.parser")

    source = soup.find('source')
    videourl = source.get('src') if source is not None else None
    if not videourl:
        return None

    print("- A sacar: " + title)
    # Download into a temporary name first: wget creates its output file even
    # when the transfer fails, and a leftover file under the final name would
    # make later runs skip this episode as "already downloaded".
    partfn = destfn + '.part'
    try:
        with open(os.devnull, 'wb') as devnull:
            # Argument list (no shell): the URL comes from a remote page and
            # must never be interpreted by a shell.  "--" stops wget from
            # treating a URL that starts with "-" as an option.
            status = subprocess.call(
                ['wget', '-O', partfn, '--', videourl],
                stdout=devnull, stderr=devnull)
    except OSError:
        # wget is missing or not executable.
        status = -1

    if status != 0:
        if os.path.isfile(partfn):
            os.remove(partfn)
        print("- Erro no download: " + title)
        return None

    os.rename(partfn, destfn)
    print("- Done")
    return True

def main():
    """Walk the programme's video listing and download every episode.

    Expects exactly one non-empty command-line argument, the programme name.
    Listing pages are requested repeatedly, each time passing the
    ``datetime`` of the last article seen as the ``offset`` query value, and
    every article link is handed to ``parsePage``.

    Exits through ``sys.exit`` with a message when the arguments are wrong,
    when a page has no articles, when the offset stops advancing, or once
    ``parsePage`` has reported five already-downloaded videos.
    """
    # An empty programme name is as unusable as a missing one.
    if len(sys.argv) != 2 or not sys.argv[1]:
        sys.exit("Correr com "+sys.argv[0]+" [progName]")
    progName = sys.argv[1]

    exists = 0
    offset = ""
    headers = {'User-Agent': agent}
    while True:
        url = "http://sicradical.sapo.pt/api/molecule/category/programas/" + progName + "/videos?offset=" + offset

        req = urllib2.Request(url, None, headers)
        response = urllib2.urlopen(req)
        try:
            html = response.read()
        finally:
            response.close()

        soup = BeautifulSoup(html, "html.parser")

        # Collect every item on the page; none at all means we are done.
        items = soup.findAll('article')
        if not items:
            sys.exit("ultima pagina")

        requested_offset = offset
        for item in items:
            if exists >= 5:
                sys.exit("A sair apos 5 falhas, ja devo ter tudo...")

            # Timestamp of this item, used as the offset for the next page.
            stamp = item.find('p', {'class': 'timeStamp'})
            dt = stamp.get('datetime') if stamp is not None else None
            if dt:
                offset = dt

            # Link to the episode page; items without one are skipped.
            anchor = item.find('a')
            link = anchor.get('href') if anchor is not None else None
            if not link:
                continue

            if parsePage(link, progName) is False:
                exists = exists + 1

        if offset == requested_offset:
            # The offset did not advance, so the next request would return
            # this same page again; stop instead of looping forever.
            sys.exit("ultima pagina")


if __name__ == "__main__":
    main()