just some cleanup.
This commit is contained in:
23
tudown.py
23
tudown.py
@@ -1,15 +1,14 @@
|
|||||||
#!/usr/bin/python3
|
#!/usr/bin/python3
|
||||||
from threading import Thread
|
from threading import Thread, active_count
|
||||||
import threading
|
|
||||||
from requests import Session, utils
|
from requests import Session, utils
|
||||||
from lxml import html
|
from lxml import html
|
||||||
from re import compile, findall
|
from re import compile, findall
|
||||||
from os import makedirs
|
from os import makedirs
|
||||||
from os.path import exists, getmtime
|
from os.path import exists, getmtime
|
||||||
from calendar import timegm
|
from calendar import timegm
|
||||||
from time import strptime, clock, time, sleep
|
from time import strptime, sleep
|
||||||
|
|
||||||
NUM_THREADS = 5 # It's ok....
|
NUM_THREADS = 5 # max number of threads
|
||||||
|
|
||||||
def create_filepath(filepath):
|
def create_filepath(filepath):
|
||||||
if not exists(filepath):
|
if not exists(filepath):
|
||||||
@@ -43,24 +42,20 @@ def download_files(session, f):
|
|||||||
|
|
||||||
def resolve_direct_links(session, hrefs):
|
def resolve_direct_links(session, hrefs):
|
||||||
links = []
|
links = []
|
||||||
t = clock()
|
|
||||||
for href in hrefs:
|
for href in hrefs:
|
||||||
tmp = session.head(href).headers
|
tmp = session.head(href).headers
|
||||||
if 'Location' in tmp:
|
if 'Location' in tmp:
|
||||||
links.append(tmp['Location'])
|
links.append(tmp['Location'])
|
||||||
#print("delta resolve:", clock() - t)
|
|
||||||
return links
|
return links
|
||||||
|
|
||||||
|
|
||||||
def get_links_from_folder(session, urls):
|
def get_links_from_folder(session, urls):
|
||||||
hrefs = []
|
hrefs = []
|
||||||
t = clock()
|
|
||||||
for url in urls:
|
for url in urls:
|
||||||
response = session.get(url)
|
response = session.get(url)
|
||||||
hrefs += findall(compile(
|
hrefs += findall(compile(
|
||||||
'https\:\/\/www\.moodle\.tum\.de\/pluginfile\.php\/\d{6}\/mod_folder\/content\/0\/(?:[\w\d\_\-]*\/)*[\w\d\_\-\.]{1,}'),
|
'https\:\/\/www\.moodle\.tum\.de\/pluginfile\.php\/\d{6}\/mod_folder\/content\/0\/(?:[\w\d\_\-]*\/)*[\w\d\_\-\.]{1,}'),
|
||||||
response.text)
|
response.text)
|
||||||
#print("delta folder:", clock() - t)
|
|
||||||
return hrefs
|
return hrefs
|
||||||
|
|
||||||
|
|
||||||
@@ -82,7 +77,7 @@ def get_file_links(session, url, files):
|
|||||||
hrefs = html.fromstring(response.text).xpath('//a/@href')
|
hrefs = html.fromstring(response.text).xpath('//a/@href')
|
||||||
|
|
||||||
# ---------------
|
# ---------------
|
||||||
t = clock()
|
|
||||||
for f in files:
|
for f in files:
|
||||||
reg = compile(f[0])
|
reg = compile(f[0])
|
||||||
for href in hrefs:
|
for href in hrefs:
|
||||||
@@ -92,7 +87,6 @@ def get_file_links(session, url, files):
|
|||||||
links.append((url + href, f[1]))
|
links.append((url + href, f[1]))
|
||||||
else:
|
else:
|
||||||
links.append((href, f[1]))
|
links.append((href, f[1]))
|
||||||
#print("delta regex:", clock() - t)
|
|
||||||
return links
|
return links
|
||||||
|
|
||||||
|
|
||||||
@@ -115,7 +109,6 @@ def establish_moodle_session(user, passwd):
|
|||||||
|
|
||||||
def main(url, files, user='', passwd=''):
|
def main(url, files, user='', passwd=''):
|
||||||
# create session
|
# create session
|
||||||
t = clock()
|
|
||||||
if 'www.moodle.tum.de' in url:
|
if 'www.moodle.tum.de' in url:
|
||||||
session = establish_moodle_session(user, passwd)
|
session = establish_moodle_session(user, passwd)
|
||||||
else:
|
else:
|
||||||
@@ -124,21 +117,15 @@ def main(url, files, user='', passwd=''):
|
|||||||
session.headers = {
|
session.headers = {
|
||||||
"Accept-Language": "de-DE,de;"
|
"Accept-Language": "de-DE,de;"
|
||||||
}
|
}
|
||||||
#print("delta session:", clock() - t)
|
|
||||||
|
|
||||||
|
|
||||||
# get file links
|
# get file links
|
||||||
links = get_file_links(session, url, files)
|
links = get_file_links(session, url, files)
|
||||||
|
|
||||||
# download files
|
# download files
|
||||||
#print(threading.active_count())
|
|
||||||
t1 = clock()
|
|
||||||
worker = []
|
worker = []
|
||||||
for l in links:
|
for l in links:
|
||||||
while threading.active_count() > NUM_THREADS:
|
while active_count() > NUM_THREADS:
|
||||||
sleep(0.1)
|
sleep(0.1)
|
||||||
worker.append(Thread(target=download_files, args=(session, l)).start())
|
worker.append(Thread(target=download_files, args=(session, l)).start())
|
||||||
|
|
||||||
[t.join() for t in worker if t]
|
[t.join() for t in worker if t]
|
||||||
#print("delta download threaded:", clock() - t1)
|
|
||||||
|
|
||||||
|
|||||||
@@ -24,9 +24,9 @@ tudown.main(url, files)
|
|||||||
url = 'http://wwwmayr.informatik.tu-muenchen.de/lehre/2015SS/theo/uebung/'
|
url = 'http://wwwmayr.informatik.tu-muenchen.de/lehre/2015SS/theo/uebung/'
|
||||||
|
|
||||||
files = [
|
files = [
|
||||||
('ue\d*\.pdf', 'Übungsblätter'),
|
('ue\d{2}\.pdf', 'Übungsblätter'),
|
||||||
('loesungen/lo\d*\.pdf', 'Lösungsblätter'),
|
('lo\d{2}_HA\.pdf', 'Lösungsblätter'),
|
||||||
('theo15zue\d*_druck\.pdf', 'Skript/ZÜ'),
|
('theo15zue\d{2}_druck\.pdf', 'Skript/ZÜ'),
|
||||||
]
|
]
|
||||||
|
|
||||||
tudown.main(url, files, user=user, passwd=passwd)
|
tudown.main(url, files, user=user, passwd=passwd)
|
||||||
|
|||||||
Reference in New Issue
Block a user