#!/usr/bin/python3
r"""Download files linked from a TUM Moodle course page or a plain web page.

The entry point is :func:`main`. It opens a session, collects every link on
``url`` that matches one of the given regular expressions and downloads the
matching files into the associated local folders. Files that already exist
locally are only fetched again when the server reports a newer
``Last-Modified`` timestamp.

Example (taken from ``update.py.example``)::

    url = 'http://wwwmayr.informatik.tu-muenchen.de/lehre/2015SS/theo/'
    files = [
        ('2015-theo\.pdf', 'Skript'),
        ('2015-\d{2}-\d{2}\.pdf', 'Skript'),
    ]
    tudown.main(url, files)

    url = 'http://wwwmayr.informatik.tu-muenchen.de/lehre/2015SS/theo/uebung/'
    files = [
        ('ue\d*\.pdf', 'Übungsblätter'),
        ('loesungen/lo\d*\.pdf', 'Lösungsblätter'),
        ('theo15zue\d*_druck\.pdf', 'Skript/ZÜ'),
    ]
    tudown.main(url, files, user=user, passwd=passwd)
"""
from threading import Thread
import threading
from requests import Session, utils
from lxml import html
from re import compile, findall
from os import makedirs
from os.path import exists, getmtime
from calendar import timegm
# NOTE: ``time.clock`` is intentionally no longer imported; it was removed in
# Python 3.8 and importing it makes the whole module fail to load.
from time import strptime, time, sleep

# Upper bound used by main() when throttling the start of download threads.
NUM_THREADS = 5

# Format of the HTTP ``Last-Modified`` header as parsed by download_files().
_LAST_MODIFIED_FORMAT = '%a, %d %b %Y %H:%M:%S %Z'

# Moodle link patterns, compiled once instead of on every call. Raw strings
# produce exactly the same pattern text as before without relying on invalid
# string escapes such as '\d'.
_RESOURCE_LINK = compile(
    r'https\:\/\/www\.moodle\.tum\.de\/mod\/resource\/view\.php\?id\=\d{6}')
_FOLDER_LINK = compile(
    r'https\:\/\/www\.moodle\.tum\.de\/mod\/folder\/view\.php\?id\=\d{6}')
_FOLDER_FILE_LINK = compile(
    r'https\:\/\/www\.moodle\.tum\.de\/pluginfile\.php\/\d{6}\/mod_folder\/content\/0\/(?:[\w\d\_\-]*\/)*[\w\d\_\-\.]{1,}')


def create_filepath(filepath):
    """Create the directory ``filepath`` (including parents) if it is missing.

    ``exist_ok=True`` closes the window between the existence check and the
    creation: several download threads may ask for the same folder at once.
    """
    if not exists(filepath):
        makedirs(filepath, exist_ok=True)


def _fetch_to_disk(session, url, folder, filename):
    """GET ``url`` and write the body to ``filename``; return True on HTTP 200."""
    response = session.get(url)
    if response.status_code != 200:
        return False
    create_filepath(folder)
    with open(filename, 'wb') as fd:
        for chunk in response.iter_content(1024):
            fd.write(chunk)
    return True


def download_files(session, f):
    """Download one file, or refresh it if the server copy is newer.

    Args:
        session: object offering ``get``/``head`` like ``requests.Session``.
        f: pair ``(url, folder)``; the file is stored in ``folder`` under the
            unquoted last path segment of ``url``.

    Prints ``[+] <file>`` for a new file and ``[M] <file>`` for a refreshed
    one. Responses with a status other than 200 are ignored.
    """
    url, folder = f[0], f[1]
    decoded = utils.unquote(url)
    # The slice starts at the last '/', so the separator is part of the name.
    filename = folder + decoded[decoded.rindex('/'):]

    if not exists(filename):
        if _fetch_to_disk(session, url, folder, filename):
            print('[+] ' + filename)
        return

    response = session.head(url)
    if response.status_code != 200:
        return
    last_modified = response.headers.get('Last-Modified')
    if last_modified is None:
        # Without a timestamp there is nothing to compare against; keep the
        # local copy instead of failing with a KeyError.
        return
    last_mod_www = timegm(strptime(last_modified, _LAST_MODIFIED_FORMAT))
    if last_mod_www > getmtime(filename):
        if _fetch_to_disk(session, url, folder, filename):
            print('[M] ' + filename)


def resolve_direct_links(session, hrefs):
    """Return the ``Location`` header of a HEAD request for each of ``hrefs``.

    Links whose response carries no ``Location`` header are dropped.
    """
    links = []
    for href in hrefs:
        headers = session.head(href).headers
        if 'Location' in headers:
            links.append(headers['Location'])
    return links


def get_links_from_folder(session, urls):
    """Fetch every page in ``urls`` and return all folder file links found."""
    hrefs = []
    for url in urls:
        hrefs += findall(_FOLDER_FILE_LINK, session.get(url).text)
    return hrefs


def get_file_links(session, url, files):
    """Collect ``(link, folder)`` pairs for every link on ``url`` of interest.

    Args:
        session: object offering ``get``/``head`` like ``requests.Session``.
        url: page to scan.
        files: iterable of ``(regex, folder)`` pairs; a link is selected when
            ``regex`` matches anywhere inside it.

    Returns:
        List of ``(link, folder)`` tuples. Links without an ``http://`` or
        ``https://`` part are prefixed with ``url``. A link that matches
        several patterns is returned once per matching pattern.
    """
    response = session.get(url)

    if 'www.moodle.tum.de' in url:
        # Resource links are resolved to the files they redirect to.
        hrefs = resolve_direct_links(
            session, findall(_RESOURCE_LINK, response.text))
        # Folder pages are fetched as well and scanned for file links.
        folders = findall(_FOLDER_LINK, response.text)
        if folders:
            hrefs += get_links_from_folder(session, folders)
    else:
        hrefs = html.fromstring(response.text).xpath('//a/@href')

    links = []
    for pattern, folder in ((compile(f[0]), f[1]) for f in files):
        for href in hrefs:
            if not pattern.search(href):
                continue
            if 'https://' in href or 'http://' in href:
                links.append((href, folder))
            else:
                links.append((url + href, folder))
    return links


def establish_moodle_session(user, passwd):
    """Log in to Moodle through the Shibboleth endpoints and return the session.

    The login form is posted with ``user``/``passwd``; the ``RelayState`` and
    ``SAMLResponse`` fields of the first form in the answer are then posted
    back to Moodle.
    """
    session = Session()

    session.get(
        'https://www.moodle.tum.de/Shibboleth.sso/Login?providerId=https://tumidp.lrz.de/idp/shibboleth&target=https://www.moodle.tum.de/auth/shibboleth/index.php')
    response = session.post('https://tumidp.lrz.de/idp/Authn/UserPassword',
                            data={'j_username': user, 'j_password': passwd})

    fields = html.fromstring(response.text).forms[0].fields

    session.post('https://www.moodle.tum.de/Shibboleth.sso/SAML2/POST',
                 data={'RelayState': fields['RelayState'],
                       'SAMLResponse': fields['SAMLResponse']})

    return session


def main(url, files, user='', passwd=''):
    """Download every file linked from ``url`` that matches ``files``.

    Args:
        url: page to scan; URLs containing ``www.moodle.tum.de`` use the
            Moodle login, everything else uses HTTP auth with ``user`` and
            ``passwd``.
        files: iterable of ``(regex, folder)`` pairs, see get_file_links().
        user: login name.
        passwd: password.

    The function returns only after all download threads have finished.
    """
    # create session
    if 'www.moodle.tum.de' in url:
        session = establish_moodle_session(user, passwd)
    else:
        session = Session()
        session.auth = (user, passwd)
        session.headers = {
            "Accept-Language": "de-DE,de;"
        }

    # get file links
    links = get_file_links(session, url, files)

    # download files
    workers = []
    for link in links:
        # Hold back new threads while more than NUM_THREADS are alive.
        while threading.active_count() > NUM_THREADS:
            sleep(0.1)
        worker = Thread(target=download_files, args=(session, link))
        # start() returns None, so the Thread object itself must be stored;
        # otherwise there is nothing left to join on.
        worker.start()
        workers.append(worker)

    for worker in workers:
        worker.join()