#!/usr/bin/env python3
"""Download course material listed in ``tudown.json``.

The configuration file is a JSON list of resources.  Each resource is a
dict with the keys read by this script:

* ``url``    -- page whose ``<a href>`` links are scanned.
* ``files``  -- list of rules; each rule has a ``folder`` (download
  target) and optionally a ``regex`` that a link must match.
* ``username`` / ``password`` -- optional literal credentials.
* ``username_script`` / ``password_script`` -- optional two-element lists
  (executable, single argument) whose stdout yields the credential.

Run with ``-v`` as the first argument to print the discovered links.
"""
import json
import re
import subprocess
import sys
from calendar import timegm
from os import makedirs
from os.path import exists, getmtime
from time import strptime, sleep
from urllib.parse import urljoin, urlsplit

import requests
from lxml import html
from requests import utils

MOODLE_LOGIN_URL = 'https://www.moodle.tum.de/Shibboleth.sso/Login?providerId=https%3A%2F%2Ftumidp.lrz.de%2Fidp%2Fshibboleth&target=https%3A%2F%2Fwww.moodle.tum.de%2Fauth%2Fshibboleth%2Findex.php'
MOODLE_SAML_URL = 'https://www.moodle.tum.de/Shibboleth.sso/SAML2/POST'
MOODLE_RESOURCE_PREFIX = 'https://www.moodle.tum.de/mod/resource/view.php?id='

# Format of the Last-Modified header as parsed by download_files().
_HTTP_DATE_FORMAT = '%a, %d %b %Y %H:%M:%S %Z'
# Permanent redirects that download_files() follows by hand so that the
# local file name is derived from the final URL.
_REDIRECT_CODES = (301, 308)
# Upper bound on chained redirects, so a redirect loop cannot recurse forever.
_MAX_REDIRECTS = 10


def create_filepath(filepath):
    """Create directory *filepath* (including parents) if it is missing."""
    if not exists(filepath):
        makedirs(filepath, exist_ok=True)


def _save_response(response, folder, filename):
    """Write the body of *response* to *filename*, creating *folder* first."""
    create_filepath(folder)
    with open(filename, 'wb') as fd:
        for chunk in response.iter_content(1024):
            fd.write(chunk)


def download_files(session, f, _redirects_left=_MAX_REDIRECTS):
    """Download one URL into a folder, or refresh an existing local copy.

    Args:
        session: requests session used for all HTTP calls.
        f: two-element sequence ``(url, folder)``.
        _redirects_left: internal counter bounding redirect recursion.

    The local file name is ``folder`` plus the unquoted URL from its last
    ``/`` onwards.  A missing file is fetched with GET and reported as
    ``[+]``.  An existing file is checked with HEAD and re-fetched
    (reported as ``[M]``) only when the server's ``Last-Modified`` is newer
    than the file's mtime.  Responses other than 200 or a followed
    redirect are ignored.
    """
    url, folder = f[0], f[1]
    decoded_url = utils.unquote(url)
    filename = folder + decoded_url[decoded_url.rindex('/'):]
    if filename.endswith('/'):
        # Directory-style URL: there is no file name to save under, and
        # opening "<folder>/" for writing would fail.
        return

    is_new = not exists(filename)
    # A HEAD request is enough to read Last-Modified for an existing file.
    request = session.get if is_new else session.head
    response = request(url, allow_redirects=False)

    if response.status_code in _REDIRECT_CODES:
        location = response.headers.get('Location')
        if location and _redirects_left > 0:
            # Location may be relative; resolve it against the current URL.
            download_files(session, (urljoin(url, location), folder),
                           _redirects_left - 1)
        return
    if response.status_code != 200:
        return

    if is_new:
        _save_response(response, folder, filename)
        print('[+] ' + filename)
        return

    last_mod_file = getmtime(filename)
    try:
        last_mod_www = timegm(strptime(response.headers['Last-Modified'],
                                       _HTTP_DATE_FORMAT))
    except (KeyError, ValueError):
        # Header missing or not in the expected format: keep the local copy.
        print('Can\'t check {} for updates.'.format(url))
        return

    if last_mod_www > last_mod_file:
        response = session.get(url)
        if response.status_code == 200:
            _save_response(response, folder, filename)
            print('[M] ' + filename)


def get_moodle_session(username, password):
    """Return a requests session logged in to moodle.tum.de via Shibboleth.

    Posts the credentials to the page the login URL leads to, then forwards
    the ``RelayState`` and ``SAMLResponse`` fields of the first form in the
    reply to the Moodle SAML endpoint.
    """
    session = requests.session()
    r = session.get(MOODLE_LOGIN_URL)
    r = session.post(r.url, data={
        'j_username': username,
        'j_password': password,
        '_eventId_proceed': '',
    })
    saml_fields = html.fromstring(r.text).forms[0].fields
    session.post(MOODLE_SAML_URL, data={
        'RelayState': saml_fields['RelayState'],
        'SAMLResponse': saml_fields['SAMLResponse'],
    })
    return session


def _run_script(script):
    """Run ``[script[0], script[1]]`` and return its stripped stdout (bytes)."""
    with subprocess.Popen([script[0], script[1]],
                          stdout=subprocess.PIPE) as proc:
        return proc.stdout.read().strip()


def _read_credentials(j):
    """Return ``(username, password)`` for resource *j*; either may be None.

    Script-provided values are read first; literal ``username`` /
    ``password`` entries, when present, override them.
    """
    username = None
    password = None
    try:
        username = _run_script(j['username_script'])
        password = _run_script(j['password_script'])
    except KeyError:
        pass  # no (or incomplete) script configuration
    try:
        username = j['username']
        password = j['password']
    except KeyError:
        pass  # no (or incomplete) literal credentials
    return username, password


def _resolve_href(base_url, href):
    """Return *href* as an absolute http(s) URL, or None for other schemes."""
    absolute = urljoin(base_url, href)
    if urlsplit(absolute).scheme not in ('http', 'https'):
        # e.g. mailto: or javascript: links, which cannot be downloaded.
        return None
    return absolute


def get_resource(j, verbose):
    """Scan the page of resource *j* and download every matching link.

    Args:
        j: resource dict from the configuration (see module docstring).
        verbose: when true, print the resolved links and the raw links
            containing ``.pdf``.

    For each link the rules in ``j['files']`` are tried in order; the
    first rule without a ``regex``, or whose ``regex`` matches the link,
    decides the target folder.
    """
    username, password = _read_credentials(j)

    # establish a session
    if 'moodle.tum.de' in j['url']:
        session = get_moodle_session(username, password)
    else:
        session = requests.session()
        # Only attach credentials when some were configured; otherwise
        # requests would send the literal text "None:None" as Basic auth.
        if username is not None or password is not None:
            session.auth = (username, password)

    # collect links
    r = session.get(j['url'])
    hrefs = html.fromstring(r.text).xpath('//a/@href')
    abs_hrefs = []
    for href in hrefs:
        if href.startswith(MOODLE_RESOURCE_PREFIX):
            # The resource page is expected to redirect to the actual
            # file; links without a redirect target are skipped.
            location = session.head(href).headers.get('Location')
            if location:
                abs_hrefs.append(urljoin(href, location))
            continue
        absolute = _resolve_href(j['url'], href)
        if absolute is not None:
            abs_hrefs.append(absolute)

    if verbose:
        print(abs_hrefs)
        print([href for href in hrefs if '.pdf' in href])

    # download
    for href in abs_hrefs:
        for rule in j['files']:
            if 'regex' not in rule or re.search(rule['regex'], href):
                download_files(session, [href, rule['folder']])
                break


def main():
    """Load ``tudown.json`` and process every resource it lists."""
    if not exists('tudown.json'):
        print('tudown.json not found')
        sys.exit(1)

    # verbose mode
    verbose = len(sys.argv) > 1 and sys.argv[1] == '-v'

    # open download configuration into json
    with open('tudown.json', 'r') as fh:
        resources = json.load(fh)

    for resource in resources:
        get_resource(resource, verbose)


if __name__ == '__main__':
    main()