sicradical/sicradical.py

78 lines
2.2 KiB
Python

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# vim: tabstop=8 expandtab shiftwidth=4 softtabstop=4
import urllib2
import re
import os
import sys
from bs4 import BeautifulSoup
# Directory that contains this script (symlinks resolved); parsePage builds
# its download directories underneath it.
scriptpath = os.path.dirname(os.path.realpath(__file__))

# User-Agent header value sent with the HTTP requests made by this script.
# The two fragments are glued together by implicit string concatenation, so
# the space separating "AppleWebKit/537.36" from "(KHTML, ...)" has to live
# inside one of the literals; it used to be missing, which produced a
# malformed browser identification string.
agent = ('Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 '
         '(KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36')
def parsePage(url, progName):
    """Download the video embedded in one episode page.

    The episode title is the run of word characters, '+', '|' and '-' that
    follows "/videos/" in *url*.  The video is saved as
    ``<scriptpath>/<progName>/<title>.mp4``; the directory is created when
    it does not exist yet.

    Args:
        url: Address of the episode page; expected to contain
            "/videos/<title>".
        progName: Programme name, used as the destination sub-directory.

    Returns:
        False when the destination file already exists (nothing is fetched),
        True when the video was downloaded successfully, and None when
        nothing could be downloaded (no title in the URL, no
        ``<source src=...>`` in the page, or wget reported an error).

    Raises:
        OSError: if the wget executable cannot be started.
    """
    # Imported locally because only this function needs it.
    import subprocess

    match = re.search(r"/videos/([\w+|-]+)", url)
    if match is None:
        # Without a title there is no file name to save to.
        print("- URL sem titulo, a ignorar: " + url)
        return None
    title = match.group(1)

    programpath = os.path.join(scriptpath, progName)
    if not os.path.isdir(programpath):
        os.makedirs(programpath)

    destfn = os.path.join(programpath, title + '.mp4')
    if os.path.isfile(destfn):
        print("- Ja downloadada... a ignorar")
        return False

    headers = {'User-Agent': agent}
    req = urllib2.Request(url, None, headers)
    response = urllib2.urlopen(req)
    try:
        html = response.read()
    finally:
        # Release the connection even if reading fails.
        response.close()

    soup = BeautifulSoup(html, "html.parser")
    source = soup.find('source')
    # find() returns None when the page has no <source> tag, and the tag may
    # lack a src attribute; neither case should abort the whole run.
    videourl = source.get('src') if source is not None else None
    if not videourl:
        return None

    print("- A sacar: " + title)
    # Download to a temporary name first: "wget -O" creates the output file
    # even when the transfer fails, and a leftover file at destfn would make
    # the isfile() check above skip this episode on every later run.
    tmpfn = destfn + '.part'
    with open(os.devnull, 'wb') as devnull:
        # Argument list instead of a shell string: videourl comes from
        # remote HTML and must never be interpreted by a shell.  "--" stops
        # wget from treating a URL that starts with "-" as an option.
        status = subprocess.call(['wget', '-O', tmpfn, '--', videourl],
                                 stdout=devnull, stderr=devnull)
    if status != 0:
        if os.path.isfile(tmpfn):
            os.remove(tmpfn)
        print("- Falhou: " + title)
        return None

    os.rename(tmpfn, destfn)
    print("- Done")
    return True
if __name__ == "__main__":
    # Entry point: walk the programme's video listing page by page and hand
    # every episode link to parsePage, stopping at the last page or once five
    # episodes were found to be already downloaded.

    # Exactly one non-empty argument (the programme name) is required.  An
    # empty argument used to pass the length check and fail later with a
    # NameError because progName was never assigned.
    if len(sys.argv) != 2 or not sys.argv[1]:
        sys.exit("Correr com "+sys.argv[0]+" [progName]")
    progName = sys.argv[1]

    exists = 0    # number of episodes parsePage reported as already present
    offset = ""   # timestamp of the last article seen; empty for first page
    headers = {'User-Agent': agent}

    while True:
        # NOTE(review): offset is appended to the query string unescaped,
        # as before - confirm the API accepts the raw timestamp format.
        url = "http://sicradical.sapo.pt/api/molecule/category/programas/" + progName + "/videos?offset=" + offset
        req = urllib2.Request(url, None, headers)
        response = urllib2.urlopen(req)
        try:
            html = response.read()
        finally:
            # Release the connection even if reading fails.
            response.close()
        soup = BeautifulSoup(html, "html.parser")

        # Collect every item on the page; a page without any is the end.
        items = soup.findAll('article')
        if not items:
            sys.exit("ultima pagina")

        previous_offset = offset
        for item in items:
            if exists >= 5:
                sys.exit("A sair apos 5 falhas, ja devo ter tudo...")

            # Date: remember the timestamp so the next request continues
            # after this article.  Articles without one are tolerated.
            stamp = item.find('p', {'class': 'timeStamp'})
            if stamp is not None and stamp.get('datetime'):
                offset = stamp['datetime']

            # URL: skip articles that carry no link instead of crashing.
            anchor = item.find('a')
            if anchor is None or not anchor.get('href'):
                continue

            if parsePage(anchor['href'], progName) is False:
                exists = exists + 1

        # If no article moved the offset forward, the next request would be
        # identical to this one and the loop could repeat indefinitely.
        if offset == previous_offset:
            sys.exit("offset nao avancou, a sair")