#!/usr/bin/env python3
r"""Download files linked from web pages, as configured in ``tudown.json``.

For every resource in the configuration the page at ``url`` is fetched, all
``<a href>`` targets are collected, and each link is downloaded into the
``folder`` of the first ``files`` entry whose ``regex`` matches it (an entry
without ``regex`` matches every link).  Files that already exist locally are
only re-downloaded when the server reports a newer ``Last-Modified`` time.

Provenance: commit a8d20b732a20b528facf97ac09e709c517655587 by
Alexander Weidinger, Mon May 18 16:27:41 2020 +0200,
"Add last version used with example configuration".

Example ``tudown.json`` shipped with that commit::

    [
        {
            "url": "https://www.moodle.tum.de/course/view.php?id=49093",
            "files": [
                {
                    "regex": "Lec\\d+-.*\\.pdf",
                    "folder": "slides"
                },
                {
                    "regex": "A\\d+.*\\.pdf",
                    "folder": "assignments"
                }
            ],
            "username_script": ["/home/alex/Scripts/get-username.sh", "Uni/LRZ"],
            "password_script": ["/home/alex/Scripts/get-password.sh", "Uni/LRZ"]
        }
    ]

Instead of the ``*_script`` keys a resource may carry literal ``username``
and ``password`` keys; when both forms are present the literal keys win.
"""
import json
import re
import subprocess
import sys
from calendar import timegm
from os import makedirs
from os.path import exists, getmtime
from time import strptime, sleep
from urllib.parse import urljoin, urlsplit

import requests
from lxml import html
from requests import utils

# HTTP status codes that carry a ``Location`` header we should follow.
_REDIRECT_CODES = frozenset((301, 302, 303, 307, 308))

# Upper bound on manually followed redirects, so a redirect loop terminates.
_MAX_REDIRECTS = 10

# Format of the HTTP ``Last-Modified`` header as parsed by ``strptime``.
_HTTP_DATE_FORMAT = '%a, %d %b %Y %H:%M:%S %Z'

_MOODLE_LOGIN_URL = 'https://www.moodle.tum.de/Shibboleth.sso/Login?providerId=https%3A%2F%2Ftumidp.lrz.de%2Fidp%2Fshibboleth&target=https%3A%2F%2Fwww.moodle.tum.de%2Fauth%2Fshibboleth%2Findex.php'
_MOODLE_SAML_URL = 'https://www.moodle.tum.de/Shibboleth.sso/SAML2/POST'

# Moodle resource links answer with a redirect to the real file.
_MOODLE_RESOURCE_PREFIX = 'https://www.moodle.tum.de/mod/resource/view.php?id='


def create_filepath(filepath):
    """Create directory *filepath* (including parents) if it is missing."""
    # exist_ok avoids the race between a separate exists() check and makedirs().
    makedirs(filepath, exist_ok=True)


def _local_filename(url, folder):
    """Return *folder* plus everything after the last '/' of the unquoted *url*."""
    decoded = utils.unquote(url)
    return folder + decoded[decoded.rindex('/'):]


def _write_response(response, folder, filename):
    """Write the body of *response* to *filename*, creating *folder* first."""
    create_filepath(folder)
    with open(filename, 'wb') as fd:
        for chunk in response.iter_content(1024):
            fd.write(chunk)


def download_files(session, f):
    """Download one URL into a folder, or refresh an existing local copy.

    Parameters:
        session: a ``requests`` session used for all HTTP calls.
        f: a two-item sequence ``(url, folder)``.

    The local file name is ``folder`` plus the last path component of the
    (unquoted) URL.  A missing file is fetched with GET and reported as
    ``[+]``.  An existing file is probed with HEAD and only re-fetched, and
    reported as ``[M]``, when ``Last-Modified`` is newer than the local
    mtime.  Redirects are followed manually because the file name is derived
    from the final URL.  Responses other than 200 are skipped silently.
    """
    url, folder = f[0], f[1]

    for _ in range(_MAX_REDIRECTS + 1):
        filename = _local_filename(url, folder)
        if filename.endswith('/'):
            # URL has no file name component; opening 'folder/' would fail.
            return

        is_new = not exists(filename)
        if is_new:
            response = session.get(url, allow_redirects=False)
        else:
            # Existing file: a HEAD is enough to read Last-Modified.
            response = session.head(url, allow_redirects=False)

        location = response.headers.get('Location')
        if response.status_code not in _REDIRECT_CODES or location is None:
            break
        # Location may be relative to the URL that produced it.
        url = urljoin(url, location)
    else:
        print('Too many redirects for {}'.format(f[0]))
        return

    if response.status_code != 200:
        return

    if is_new:
        _write_response(response, folder, filename)
        print('[+] ' + filename)
        return

    last_mod_file = getmtime(filename)
    try:
        last_mod_www = timegm(strptime(response.headers['Last-Modified'], _HTTP_DATE_FORMAT))
    except (KeyError, ValueError):
        # Header missing or not in the expected format: treat as unchanged.
        print('Can\'t check {} for updates.'.format(url))
        last_mod_www = last_mod_file

    if last_mod_www > last_mod_file:
        response = session.get(url)
        if response.status_code == 200:
            _write_response(response, folder, filename)
            print('[M] ' + filename)


def get_moodle_session(username, password):
    """Log in to moodle.tum.de through Shibboleth and return the session.

    Parameters:
        username, password: credentials posted to the identity provider's
            login form as ``j_username`` / ``j_password``.

    Returns:
        The ``requests`` session that performed the login.

    Raises:
        RuntimeError: if the identity provider's answer does not contain the
            form fields ``RelayState`` and ``SAMLResponse``.
    """
    session = requests.session()
    r = session.get(_MOODLE_LOGIN_URL)
    r = session.post(r.url, data={'j_username': username, 'j_password': password, '_eventId_proceed': ''})

    forms = html.fromstring(r.text).forms
    if not forms:
        raise RuntimeError('Moodle login failed: no SAML response form returned.')
    fields = forms[0].fields
    try:
        saml_data = {'RelayState': fields['RelayState'], 'SAMLResponse': fields['SAMLResponse']}
    except KeyError:
        raise RuntimeError('Moodle login failed: SAML response fields missing (wrong credentials?).')

    # Hand the SAML assertion back to Moodle to complete the login.
    session.post(_MOODLE_SAML_URL, data=saml_data)
    return session


def _run_credential_script(argv):
    """Run the command *argv* and return its stripped standard output (bytes)."""
    with subprocess.Popen(list(argv), stdout=subprocess.PIPE) as proc:
        return proc.stdout.read().strip()


def _read_credentials(j):
    """Return ``(username, password)`` for resource *j*; either may be None."""
    username = None
    password = None

    try:
        username = _run_credential_script(j['username_script'])
        password = _run_credential_script(j['password_script'])
    except KeyError:
        pass

    # Literal credentials in the configuration override the script output.
    try:
        username = j['username']
        password = j['password']
    except KeyError:
        pass

    return username, password


def _resolve_hrefs(session, base_url, hrefs):
    """Turn the raw *hrefs* found on *base_url* into absolute http(s) URLs."""
    resolved = []
    for href in hrefs:
        if href.startswith(_MOODLE_RESOURCE_PREFIX):
            # The resource page redirects to the file; without a redirect
            # there is nothing to download from this link.
            location = session.head(href).headers.get('Location')
            if location is not None:
                resolved.append(urljoin(href, location))
            continue

        try:
            absolute = urljoin(base_url, href)
            scheme = urlsplit(absolute).scheme
        except ValueError:
            # Malformed link; skip it.
            continue
        # Drop mailto:, javascript: and other links that cannot be fetched.
        if scheme in ('http', 'https'):
            resolved.append(absolute)
    return resolved


def get_resource(j, verbose):
    """Download every matching link of one configured resource.

    Parameters:
        j: one resource dict from ``tudown.json``; reads ``url``, ``files``
            and the optional credential keys.
        verbose: when true, print the resolved links and the raw links that
            contain ``.pdf``.
    """
    username, password = _read_credentials(j)

    if 'moodle.tum.de' in j['url']:
        session = get_moodle_session(username, password)
    else:
        session = requests.session()
        # Only send HTTP auth when credentials were actually configured.
        if username is not None and password is not None:
            session.auth = (username, password)

    r = session.get(j['url'])
    hrefs = html.fromstring(r.text).xpath('//a/@href')
    abs_hrefs = _resolve_hrefs(session, j['url'], hrefs)

    if verbose:
        print(abs_hrefs)
        print([href for href in hrefs if '.pdf' in href])

    for href in abs_hrefs:
        for rule in j['files']:
            # The first rule that applies wins; a rule without regex always applies.
            if 'regex' not in rule or re.search(rule['regex'], href):
                download_files(session, [href, rule['folder']])
                break


def main():
    """Read ``tudown.json`` from the working directory and process each resource.

    Passing ``-v`` as the first command line argument enables verbose output.
    Exits with status 1 when the configuration file does not exist.
    """
    if not exists('tudown.json'):
        print('tudown.json not found')
        sys.exit(1)

    verbose = len(sys.argv) > 1 and sys.argv[1] == '-v'

    with open('tudown.json', 'r', encoding='utf-8') as fh:
        j = json.load(fh)

    for resource in j:
        get_resource(resource, verbose)


if __name__ == '__main__':
    main()