just some cleanup.

This commit is contained in:
Alexander Weidinger
2015-04-23 16:06:28 +02:00
parent d13ec7ebba
commit e2107b15b0
2 changed files with 8 additions and 21 deletions

View File

@@ -1,15 +1,14 @@
#!/usr/bin/python3 #!/usr/bin/python3
from threading import Thread from threading import Thread, active_count
import threading
from requests import Session, utils from requests import Session, utils
from lxml import html from lxml import html
from re import compile, findall from re import compile, findall
from os import makedirs from os import makedirs
from os.path import exists, getmtime from os.path import exists, getmtime
from calendar import timegm from calendar import timegm
from time import strptime, clock, time, sleep from time import strptime, sleep
NUM_THREADS = 5 # It's ok.... NUM_THREADS = 5 # max number of threads
def create_filepath(filepath): def create_filepath(filepath):
if not exists(filepath): if not exists(filepath):
@@ -43,24 +42,20 @@ def download_files(session, f):
def resolve_direct_links(session, hrefs): def resolve_direct_links(session, hrefs):
links = [] links = []
t = clock()
for href in hrefs: for href in hrefs:
tmp = session.head(href).headers tmp = session.head(href).headers
if 'Location' in tmp: if 'Location' in tmp:
links.append(tmp['Location']) links.append(tmp['Location'])
#print("delta resolve:", clock() - t)
return links return links
def get_links_from_folder(session, urls): def get_links_from_folder(session, urls):
hrefs = [] hrefs = []
t = clock()
for url in urls: for url in urls:
response = session.get(url) response = session.get(url)
hrefs += findall(compile( hrefs += findall(compile(
'https\:\/\/www\.moodle\.tum\.de\/pluginfile\.php\/\d{6}\/mod_folder\/content\/0\/(?:[\w\d\_\-]*\/)*[\w\d\_\-\.]{1,}'), 'https\:\/\/www\.moodle\.tum\.de\/pluginfile\.php\/\d{6}\/mod_folder\/content\/0\/(?:[\w\d\_\-]*\/)*[\w\d\_\-\.]{1,}'),
response.text) response.text)
#print("delta folder:", clock() - t)
return hrefs return hrefs
@@ -82,7 +77,7 @@ def get_file_links(session, url, files):
hrefs = html.fromstring(response.text).xpath('//a/@href') hrefs = html.fromstring(response.text).xpath('//a/@href')
# --------------- # ---------------
t = clock()
for f in files: for f in files:
reg = compile(f[0]) reg = compile(f[0])
for href in hrefs: for href in hrefs:
@@ -92,7 +87,6 @@ def get_file_links(session, url, files):
links.append((url + href, f[1])) links.append((url + href, f[1]))
else: else:
links.append((href, f[1])) links.append((href, f[1]))
#print("delta regex:", clock() - t)
return links return links
@@ -115,7 +109,6 @@ def establish_moodle_session(user, passwd):
def main(url, files, user='', passwd=''): def main(url, files, user='', passwd=''):
# create session # create session
t = clock()
if 'www.moodle.tum.de' in url: if 'www.moodle.tum.de' in url:
session = establish_moodle_session(user, passwd) session = establish_moodle_session(user, passwd)
else: else:
@@ -124,21 +117,15 @@ def main(url, files, user='', passwd=''):
session.headers = { session.headers = {
"Accept-Language": "de-DE,de;" "Accept-Language": "de-DE,de;"
} }
#print("delta session:", clock() - t)
# get file links # get file links
links = get_file_links(session, url, files) links = get_file_links(session, url, files)
# download files # download files
#print(threading.active_count())
t1 = clock()
worker = [] worker = []
for l in links: for l in links:
while threading.active_count() > NUM_THREADS: while active_count() > NUM_THREADS:
sleep(0.1) sleep(0.1)
worker.append(Thread(target=download_files, args=(session, l)).start()) worker.append(Thread(target=download_files, args=(session, l)).start())
[t.join() for t in worker if t] [t.join() for t in worker if t]
#print("delta download threaded:", clock() - t1)

View File

@@ -24,9 +24,9 @@ tudown.main(url, files)
url = 'http://wwwmayr.informatik.tu-muenchen.de/lehre/2015SS/theo/uebung/' url = 'http://wwwmayr.informatik.tu-muenchen.de/lehre/2015SS/theo/uebung/'
files = [ files = [
('ue\d*\.pdf', 'Übungsblätter'), ('ue\d{2}\.pdf', 'Übungsblätter'),
('loesungen/lo\d*\.pdf', 'Lösungsblätter'), ('lo\d{2}_HA\.pdf', 'Lösungsblätter'),
('theo15zue\d*_druck\.pdf', 'Skript/ZÜ'), ('theo15zue\d{2}_druck\.pdf', 'Skript/ZÜ'),
] ]
tudown.main(url, files, user=user, passwd=passwd) tudown.main(url, files, user=user, passwd=passwd)