Implemented threaded downloading

This commit is contained in:
Maximilian Pudelko
2015-04-21 19:08:34 +02:00
parent c5e397e028
commit b40ff0de87

200
tudown.py
View File

@@ -1,113 +1,145 @@
#!/usr/bin/python3 #!/usr/bin/python3
from threading import Thread
import threading
from requests import Session, utils from requests import Session, utils
from lxml import html from lxml import html
from re import compile, findall, match from re import compile, findall
from os import makedirs from os import makedirs
from os.path import exists, getmtime from os.path import exists, getmtime
from calendar import timegm from calendar import timegm
from time import strptime from time import strptime, clock, time, sleep
NUM_THREADS = 5  # Upper bound on the number of download threads main() keeps running at the same time.
def create_filepath(filepath):
    """Create the directory ``filepath``, including missing parents.

    Downloads run in several threads, so two workers may try to create the
    same directory at the same moment.  A separate "exists?" check followed
    by ``makedirs`` can then fail with ``FileExistsError`` in the slower
    thread; ``exist_ok=True`` makes the call safe to repeat.

    :param filepath: directory path to create.
    :raises FileExistsError: if ``filepath`` exists but is not a directory.
    """
    makedirs(filepath, exist_ok=True)
def _remote_is_newer(session, url, filename):
    """Return True if the server reports a copy newer than ``filename``.

    A HEAD request is sent and its ``Last-Modified`` header is compared with
    the modification time of the local file.  If the request does not return
    200, or the header is missing or cannot be parsed, the local copy is
    kept (False) instead of letting the worker die on an exception.
    """
    response = session.head(url)
    if response.status_code != 200:
        return False
    try:
        last_mod_www = timegm(strptime(response.headers['Last-Modified'],
                                       '%a, %d %b %Y %H:%M:%S %Z'))
    except (KeyError, ValueError):
        return False
    return last_mod_www > getmtime(filename)


def download_files(session, f):
    """Download a single file unless an up-to-date local copy exists.

    :param session: object offering ``get(url)`` and ``head(url)`` in the
        style of ``requests.Session``.
    :param f: pair ``(url, directory)``; the file is stored in ``directory``
        under the last path component of the URL-decoded ``url``.

    Prints ``[+] <filename>`` for a newly downloaded file and
    ``[M] <filename>`` when an existing file was replaced by a newer one.
    Nothing is written or printed if the server does not answer with 200.
    """
    url, directory = f[0], f[1]
    decoded = utils.unquote(url)
    filename = directory + decoded[decoded.rindex('/'):]

    if not exists(filename):
        marker = '[+] '
    elif _remote_is_newer(session, url, filename):
        marker = '[M] '
    else:
        return

    response = session.get(url)
    if response.status_code != 200:
        return

    # Several workers may target the same directory at once, so create it
    # in a way that tolerates the directory appearing in the meantime.
    makedirs(directory, exist_ok=True)
    with open(filename, 'wb') as fd:
        for chunk in response.iter_content(1024):
            fd.write(chunk)
    print(marker + filename)
def download_files(session, files):
    """Download every listed file that is missing or outdated locally.

    :param session: object offering ``get(url)`` and ``head(url)`` in the
        style of ``requests.Session``.
    :param files: iterable of ``(url, directory)`` pairs.  A single
        ``(url, directory)`` tuple is accepted too, so the function can be
        used as a per-link thread target as well as for a whole list.

    Each file is stored in ``directory`` under the last path component of
    the URL-decoded ``url``.  Prints ``[+] <filename>`` for new files and
    ``[M] <filename>`` for files replaced by a newer remote copy.  Entries
    whose request does not return 200 are skipped.  An existing file is kept
    when the server sends no usable ``Last-Modified`` header.
    """
    # A lone pair is a tuple of two items whose first item is the URL string.
    if isinstance(files, tuple) and len(files) == 2 and isinstance(files[0], str):
        files = [files]

    timestamp_format = '%a, %d %b %Y %H:%M:%S %Z'
    for entry in files:
        url, directory = entry[0], entry[1]
        decoded = utils.unquote(url)
        filename = directory + decoded[decoded.rindex('/'):]

        if exists(filename):
            head = session.head(url)
            if head.status_code != 200:
                continue
            try:
                last_mod_www = timegm(strptime(head.headers['Last-Modified'],
                                               timestamp_format))
            except (KeyError, ValueError):
                # Without a usable timestamp the local copy cannot be
                # judged; keep it rather than abort the whole run.
                continue
            if last_mod_www <= getmtime(filename):
                continue
            marker = '[M] '
        else:
            marker = '[+] '

        response = session.get(url)
        if response.status_code != 200:
            continue
        # exist_ok tolerates concurrent callers creating the same directory.
        makedirs(directory, exist_ok=True)
        # The handle gets its own name so it does not shadow the loop entry.
        with open(filename, 'wb') as fd:
            for chunk in response.iter_content(1024):
                fd.write(chunk)
        print(marker + filename)
def resolve_direct_links(session, hrefs):
    """Return the redirect targets of ``hrefs``.

    A HEAD request is sent for every entry of ``hrefs``; whenever the reply
    carries a ``Location`` header, its value is collected.  Entries without
    that header are dropped, so the result may be shorter than the input.

    :param session: object offering ``head(url)`` whose result has a
        ``headers`` mapping (``requests.Session`` style).
    :param hrefs: iterable of URLs to resolve.
    :return: list of ``Location`` values, in input order.
    """
    links = []
    for href in hrefs:
        headers = session.head(href).headers
        if 'Location' in headers:
            links.append(headers['Location'])
    return links
def get_links_from_folder(session, urls):
    """Collect the Moodle folder-content file links found on the given pages.

    Every URL in ``urls`` is fetched with ``session.get`` and the response
    text is scanned for ``pluginfile.php/<6 digits>/mod_folder/content/0/...``
    links on ``www.moodle.tum.de``.

    :param session: object offering ``get(url)`` whose result has a ``text``
        attribute (``requests.Session`` style).
    :param urls: iterable of page URLs to scan.
    :return: list of all matching links, in page order then match order.
    """
    # Raw string: same pattern text as before, without relying on Python
    # passing unknown string escapes through.  Compiled once, not per page.
    pattern = compile(
        r'https\:\/\/www\.moodle\.tum\.de\/pluginfile\.php\/\d{6}\/mod_folder\/content\/0\/(?:[\w\d\_\-]*\/)*[\w\d\_\-\.]{1,}')
    hrefs = []
    for url in urls:
        response = session.get(url)
        hrefs += pattern.findall(response.text)
    return hrefs
def get_file_links(session, url, files):
    """Find the download links on ``url`` that match the wanted patterns.

    For a ``www.moodle.tum.de`` URL the page is scanned for resource links
    (resolved to their redirect targets via ``resolve_direct_links``) and
    folder links (expanded via ``get_links_from_folder``).  For any other
    URL all ``<a href>`` values of the page are used.

    :param session: ``requests.Session``-style object used for all requests.
    :param url: page to scan; also prepended to hrefs that contain neither
        ``http://`` nor ``https://``.
    :param files: iterable of ``(regex, directory)`` pairs.
    :return: list of ``(link, directory)`` tuples for every href matched by
        a regex, ordered by pattern and then by position on the page.  Each
        distinct tuple appears once: duplicate entries would make two
        download threads write the same file at the same time.
    """
    response = session.get(url)
    if 'www.moodle.tum.de' in url:
        # get file links
        hrefs = findall(
            r'https\:\/\/www\.moodle\.tum\.de\/mod\/resource\/view\.php\?id\=\d{6}',
            response.text)
        # resolve all links to direct to the files
        hrefs = resolve_direct_links(session, hrefs)
        # get folder links
        folders = findall(
            r'https\:\/\/www\.moodle\.tum\.de\/mod\/folder\/view\.php\?id\=\d{6}',
            response.text)
        if folders:
            hrefs += get_links_from_folder(session, folders)
    else:
        hrefs = html.fromstring(response.text).xpath('//a/@href')

    links = []
    seen = set()
    for f in files:
        reg = compile(f[0])
        for href in hrefs:
            if not reg.search(href):
                continue
            if 'https://' in href or 'http://' in href:
                link = href
            else:
                link = url + href
            entry = (link, f[1])
            if entry not in seen:
                seen.add(entry)
                links.append(entry)
    return links
def establish_moodle_session(user, passwd):
    """Return a session that has gone through the TUM Moodle login flow.

    Three requests are made in order: a GET on the Moodle Shibboleth login
    URL, a POST of the credentials to the identity provider, and a POST of
    the ``RelayState`` / ``SAMLResponse`` fields from the first form of the
    identity provider's reply back to Moodle.

    :param user: value sent as ``j_username``.
    :param passwd: value sent as ``j_password``.
    :return: the ``Session`` used for the three requests.
    """
    session = Session()

    login_url = 'https://www.moodle.tum.de/Shibboleth.sso/Login?providerId=https://tumidp.lrz.de/idp/shibboleth&target=https://www.moodle.tum.de/auth/shibboleth/index.php'
    session.get(login_url)

    credentials = {'j_username': user, 'j_password': passwd}
    idp_reply = session.post('https://tumidp.lrz.de/idp/Authn/UserPassword',
                             data=credentials)

    # The reply's first form carries the values to hand back to Moodle.
    form = html.fromstring(idp_reply.text).forms[0]
    assertion = {
        'RelayState': form.fields['RelayState'],
        'SAMLResponse': form.fields['SAMLResponse'],
    }
    session.post('https://www.moodle.tum.de/Shibboleth.sso/SAML2/POST',
                 data=assertion)
    return session
def main(url, files, user='', passwd=''):
    """Download all files linked on ``url`` that match ``files``.

    :param url: page to scan; a ``www.moodle.tum.de`` URL triggers the
        Moodle login flow, any other URL uses HTTP auth with the given
        credentials.
    :param files: iterable of ``(regex, directory)`` pairs, passed on to
        ``get_file_links``.
    :param user: login name.
    :param passwd: password.

    Downloads run concurrently in at most ``NUM_THREADS`` worker threads;
    the function returns once every download has finished.  A failure in
    one download is reported on stdout and does not stop the others.
    """
    # Function-scope import keeps this block independent of the file header.
    from concurrent.futures import ThreadPoolExecutor, as_completed

    # create session
    if 'www.moodle.tum.de' in url:
        session = establish_moodle_session(user, passwd)
    else:
        session = Session()
        session.auth = (user, passwd)
        # NOTE(review): the reviewed source has lost its indentation; this
        # assignment is assumed to belong to the non-Moodle branch - confirm.
        session.headers = {
            "Accept-Language": "de-DE,de;"
        }

    # get file links
    links = get_file_links(session, url, files)

    # download files
    # A bounded pool replaces polling threading.active_count(), which also
    # counts threads that have nothing to do with the downloads.
    # NOTE(review): all workers share one session object, as before.
    with ThreadPoolExecutor(max_workers=NUM_THREADS) as pool:
        pending = {pool.submit(download_files, session, link): link
                   for link in links}
        for done in as_completed(pending):
            error = done.exception()
            if error is not None:
                print('[!] ' + pending[done][0] + ': ' + repr(error))