78 lines
2.2 KiB
Python
78 lines
2.2 KiB
Python
#!/usr/bin/env python
|
|
# -*- coding: utf-8 -*-
|
|
# vim: tabstop=8 expandtab shiftwidth=4 softtabstop=4
|
|
|
|
import urllib2
|
|
import re
|
|
import os
|
|
import sys
|
|
from bs4 import BeautifulSoup
|
|
|
|
# Directory that contains this script (symlinks resolved).
scriptpath = os.path.dirname(os.path.realpath(__file__))
# Browser-like User-Agent header value sent with the HTTP requests.
# The trailing space on the first piece matters: adjacent string literals
# are joined with nothing in between.
agent = ('Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 '
         '(KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36')
|
|
|
|
def parsePage(url, progName):
    """Download the video referenced by a programme page.

    The page at ``url`` is fetched, its first ``<source>`` tag is located
    and the file it points to is saved with ``wget`` as
    ``<scriptpath>/<progName>/<title>.mp4``, where ``title`` is the slug
    that follows ``/videos/`` in ``url``.

    Args:
        url: Address of the video page; expected to contain
            ``/videos/<slug>``.
        progName: Programme name, used as the destination sub-directory.

    Returns:
        ``True`` when the video was downloaded, ``False`` when the
        destination file already exists, and ``None`` when nothing could
        be downloaded (no slug in ``url``, no ``<source>`` URL in the page,
        or ``wget`` reported a failure).
    """
    # Function-scope import so the block does not depend on the module's
    # import list.
    import subprocess

    match = re.search(r"/videos/([\w+|-]+)", url)
    if match is None:
        # Without a slug there is no file name to save under.
        print("- URL sem video: " + url)
        return None
    title = match.group(1)

    programpath = os.path.join(scriptpath, progName)
    if not os.path.isdir(programpath):
        os.makedirs(programpath)
    destfn = os.path.join(programpath, title + '.mp4')

    if os.path.isfile(destfn):
        print("- Ja downloadada... a ignorar")
        return False

    headers = {'User-Agent': agent}
    req = urllib2.Request(url, None, headers)
    response = urllib2.urlopen(req)
    try:
        html = response.read()
    finally:
        # Release the connection even if reading fails.
        response.close()

    soup = BeautifulSoup(html, "html.parser")

    source = soup.find('source')
    videourl = source.get('src') if source is not None else None
    if not videourl:
        print("- Video nao encontrado: " + title)
        return None

    print("- A sacar: " + title)
    # Pass an argument list rather than a shell string: the URL comes from a
    # remote page and must not be interpreted by a shell. "--" stops wget
    # from treating a URL that starts with "-" as an option.
    with open(os.devnull, 'w') as devnull:
        status = subprocess.call(
            ['wget', '-O', destfn, '--', videourl],
            stdout=devnull, stderr=devnull)

    if status != 0:
        # A failed wget may leave an empty or partial output file behind;
        # remove it so a later run does not treat it as already downloaded.
        if os.path.isfile(destfn):
            os.remove(destfn)
        print("- Erro ao sacar: " + title)
        return None

    print("- Done")
    return True
|
|
|
|
if __name__ == "__main__":
    # Script entry point: walk the programme's paginated video listing and
    # hand every video page to parsePage().
    # Exactly one non-empty argument (the programme name) is required; an
    # empty string would otherwise leave progName undefined.
    if len(sys.argv) != 2 or not sys.argv[1]:
        sys.exit("Correr com "+sys.argv[0]+" [progName]")
    progName = sys.argv[1]

    headers = {'User-Agent': agent}

    exists = 0   # how many videos were found already on disk
    offset = ""  # pagination cursor: "datetime" of the last item seen
    while True:
        url = "http://sicradical.sapo.pt/api/molecule/category/programas/" + progName + "/videos?offset=" + offset

        req = urllib2.Request(url, None, headers)
        html = urllib2.urlopen(req).read()

        soup = BeautifulSoup(html, "html.parser")

        # grab every item on the page
        items = soup.findAll('article')
        if not items:
            sys.exit("ultima pagina")

        previous_offset = offset
        for item in items:
            if exists >= 5:
                sys.exit("A sair apos 5 falhas, ja devo ter tudo...")

            # timestamp: becomes the cursor for the next page request
            stamp = item.find('p', {'class': 'timeStamp'})
            dt = stamp.get('datetime') if stamp is not None else None
            if dt:
                offset = dt

            # link to the video page; skip items that do not carry one
            anchor = item.find('a')
            link = anchor.get('href') if anchor is not None else None
            if not link:
                continue

            # Only an explicit False (file already present) counts.
            if parsePage(link, progName) is False:
                exists = exists + 1

        if offset == previous_offset:
            # The cursor did not move, so the next request would return
            # this same page again; stop instead of looping forever.
            sys.exit("A sair: offset nao avancou")
|