Implemented threaded downloading
This commit is contained in:
200
tudown.py
200
tudown.py
@@ -1,113 +1,145 @@
|
|||||||
#!/usr/bin/python3
|
#!/usr/bin/python3
|
||||||
|
from threading import Thread
|
||||||
|
import threading
|
||||||
from requests import Session, utils
|
from requests import Session, utils
|
||||||
from lxml import html
|
from lxml import html
|
||||||
from re import compile, findall, match
|
from re import compile, findall
|
||||||
from os import makedirs
|
from os import makedirs
|
||||||
from os.path import exists, getmtime
|
from os.path import exists, getmtime
|
||||||
from calendar import timegm
|
from calendar import timegm
|
||||||
from time import strptime
|
from time import strptime, clock, time, sleep
|
||||||
|
|
||||||
|
NUM_THREADS = 5  # Cap that main() uses to throttle how many download threads run at once.
|
||||||
|
|
||||||
def create_filepath(filepath):
    """Create the directory ``filepath`` (and missing parents) if it is absent.

    Nothing happens when the path already exists.
    """
    if not exists(filepath):
        # Downloads run in several threads, so another thread may create the
        # directory between the check above and this call; exist_ok makes
        # that harmless instead of raising FileExistsError.
        makedirs(filepath, exist_ok=True)
|
||||||
|
|
||||||
|
|
||||||
|
def _save_response(response, directory, filename):
    """Stream the body of ``response`` into ``filename`` inside ``directory``."""
    create_filepath(directory)
    with open(filename, 'wb') as fd:
        for chunk in response.iter_content(1024):
            fd.write(chunk)


def download_files(session, f):
    """Download a single file, or refresh it when the server copy is newer.

    ``session`` is the HTTP session used for the requests.  ``f`` is a
    ``(url, directory)`` pair: the file is stored in ``directory`` under
    the last path component of the unquoted URL.

    A file that does not exist locally is fetched and reported with
    ``[+]``.  For an existing file a HEAD request is made and the file is
    re-fetched (reported with ``[M]``) only when the ``Last-Modified``
    header is later than the local modification time.  Any response whose
    status is not 200 is ignored.
    """
    url, directory = f[0], f[1]
    unquoted = utils.unquote(url)
    # Slice from the last '/' so the slash itself acts as the separator
    # between the directory and the file name.
    filename = directory + unquoted[unquoted.rindex('/'):]

    if not exists(filename):
        response = session.get(url)
        if response.status_code == 200:
            _save_response(response, directory, filename)
            print('[+] ' + filename)
        return

    response = session.head(url)
    if response.status_code != 200:
        return

    last_modified = response.headers.get('Last-Modified')
    if last_modified is None:
        # Without the header there is nothing to compare against; keep the
        # local copy rather than dying with a KeyError inside the thread.
        return

    last_mod_file = getmtime(filename)
    last_mod_www = timegm(strptime(last_modified, '%a, %d %b %Y %H:%M:%S %Z'))
    if last_mod_www > last_mod_file:
        response = session.get(url)
        if response.status_code == 200:
            _save_response(response, directory, filename)
            print('[M] ' + filename)
|
|
||||||
|
|
||||||
def resolve_direct_links(session, hrefs):
    """Return the redirect targets of ``hrefs``.

    A HEAD request is sent for every entry of ``hrefs``; the value of the
    ``Location`` response header is collected when present.  Entries whose
    response carries no ``Location`` header are dropped, so the result may
    be shorter than the input.  Order is preserved.
    """
    # The former ``t = clock()`` timing call was dropped: its value was only
    # read by a commented-out print, and time.clock() no longer exists on
    # Python 3.8+.
    links = []
    for href in hrefs:
        location = session.head(href).headers.get('Location')
        if location is not None:
            links.append(location)
    return links
|
||||||
|
|
||||||
|
|
||||||
def get_links_from_folder(session, urls):
    """Collect the file links found on the given folder pages.

    Each URL in ``urls`` is fetched with ``session`` and its text is
    searched for Moodle ``pluginfile.php/.../mod_folder/content/0/...``
    links.  All matches are returned in one flat list, in page order.
    """
    # Compiled once up front instead of once per page; the raw string keeps
    # the pattern text unchanged while avoiding invalid-escape warnings.
    # The former ``t = clock()`` timing call was dropped: unused, and
    # time.clock() no longer exists on Python 3.8+.
    pattern = compile(
        r'https\:\/\/www\.moodle\.tum\.de\/pluginfile\.php\/\d{6}\/mod_folder\/content\/0\/(?:[\w\d\_\-]*\/)*[\w\d\_\-\.]{1,}')
    hrefs = []
    for url in urls:
        response = session.get(url)
        hrefs += pattern.findall(response.text)
    return hrefs
|
||||||
|
|
||||||
|
|
||||||
def get_file_links(session, url, files):
    """Return ``(link, directory)`` pairs for every wanted file on ``url``.

    ``files`` is a sequence of ``(regex, directory)`` pairs.  The page at
    ``url`` is fetched and its links are gathered:

    * for Moodle URLs, resource links are resolved to their direct targets
      and links found on folder pages are added;
    * for any other URL, every ``<a href>`` of the page is used.

    Each gathered link is then tested against each regex; on a match the
    link is paired with that entry's directory.  Links containing neither
    ``https://`` nor ``http://`` are prefixed with ``url``.
    """
    links = []

    response = session.get(url)

    if 'www.moodle.tum.de' in url:
        # get file links
        hrefs = findall(
            compile(r'https\:\/\/www\.moodle\.tum\.de\/mod\/resource\/view\.php\?id\=\d{6}'),
            response.text)
        # resolve all links to direct to the files
        hrefs = resolve_direct_links(session, hrefs)
        # get folder links
        folders = findall(
            compile(r'https\:\/\/www\.moodle\.tum\.de\/mod\/folder\/view\.php\?id\=\d{6}'),
            response.text)
        if folders:
            hrefs += get_links_from_folder(session, folders)
    else:
        hrefs = html.fromstring(response.text).xpath('//a/@href')

    # ---------------
    # The former ``t = clock()`` timing call was dropped: unused, and
    # time.clock() no longer exists on Python 3.8+.
    for f in files:
        reg = compile(f[0])
        for href in hrefs:
            # search() is enough: only the existence of a match matters.
            if not reg.search(href):
                continue
            if 'https://' in href or 'http://' in href:
                links.append((href, f[1]))
            else:
                links.append((url + href, f[1]))
    return links
|
|
||||||
|
|
||||||
def establish_moodle_session(user, passwd):
    """Log in to TUM Moodle through Shibboleth and return the session.

    Three requests are made on a fresh ``Session``: a GET of the Moodle
    Shibboleth login URL, a POST of ``user``/``passwd`` to the identity
    provider, and a POST of the ``RelayState`` and ``SAMLResponse`` fields
    of the first form in the provider's reply back to Moodle.
    """
    login_url = 'https://www.moodle.tum.de/Shibboleth.sso/Login?providerId=https://tumidp.lrz.de/idp/shibboleth&target=https://www.moodle.tum.de/auth/shibboleth/index.php'
    idp_url = 'https://tumidp.lrz.de/idp/Authn/UserPassword'
    saml_url = 'https://www.moodle.tum.de/Shibboleth.sso/SAML2/POST'

    session = Session()
    session.get(login_url)

    credentials = {'j_username': user, 'j_password': passwd}
    response = session.post(idp_url, data=credentials)

    # The identity provider's reply carries the SAML hand-off in its first form.
    fields = html.fromstring(response.text).forms[0].fields
    handoff = {
        'RelayState': fields['RelayState'],
        'SAMLResponse': fields['SAMLResponse'],
    }
    session.post(saml_url, data=handoff)

    return session
|
|
||||||
|
|
||||||
def main(url, files, user='', passwd=''):
    """Download every file on ``url`` that matches an entry of ``files``.

    ``files`` is passed on to ``get_file_links``; each resulting link is
    handed to ``download_files`` on its own thread, with at most
    ``NUM_THREADS`` downloads running at the same time.  The call returns
    once all downloads have finished.

    For Moodle URLs a logged-in session is established with ``user`` and
    ``passwd``; otherwise they are used as the session's ``auth``
    credentials.
    """
    # create session
    if 'www.moodle.tum.de' in url:
        session = establish_moodle_session(user, passwd)
    else:
        session = Session()
        session.auth = (user, passwd)
        # NOTE(review): indentation was lost in the source view; the header
        # override is assumed to belong to this non-Moodle branch - confirm.
        session.headers = {
            "Accept-Language": "de-DE,de;"
        }

    # get file links
    links = get_file_links(session, url, files)

    # download files
    # A semaphore plus join() replaces polling threading.active_count():
    # that count covers every thread in the process, so unrelated threads
    # would skew the throttle or keep the final wait loop spinning forever.
    # The former clock() timing calls were dropped: unused, and
    # time.clock() no longer exists on Python 3.8+.
    slots = threading.BoundedSemaphore(NUM_THREADS)

    def worker(link):
        try:
            download_files(session, link)
        finally:
            # Free the slot even if the download raised.
            slots.release()

    workers = []
    for link in links:
        slots.acquire()  # blocks while NUM_THREADS downloads are running
        thread = Thread(target=worker, args=(link,))
        thread.start()
        workers.append(thread)

    for thread in workers:
        thread.join()
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user