Source code for fundamentals.download.multiobject_download

#!/usr/local/bin/python
# encoding: utf-8
"""
*Download resources from a list of URLs.

There are options to rename all the downloaded resources, index the files, set differing download locations and pass basic authentication credentials.*

Author
: David Young
"""

from __future__ import print_function
from future import standard_library

standard_library.install_aliases()
from builtins import zip
from builtins import str
import sys
import os

os.environ["TERM"] = "vt100"
from fundamentals import tools
import urllib
import requests
from multiprocessing.pool import ThreadPool

# PER-REQUEST TIMEOUT IN SECONDS. multiobject_download OVERWRITES THIS WITH ITS
# `timeout` ARGUMENT AND fetch_url READS IT AS ITS STARTING TIMEOUT
gtimeout = 10.0
# LOGGER SHARED WITH fetch_url. EMPTY-STRING PLACEHOLDER UNTIL
# multiobject_download ASSIGNS THE LOGGER IT WAS GIVEN
llog = ""
import random
import time


def _add_basic_auth(url, credentials):
    """
    *embed basic http credentials in the authority part of a url*

    **Key Arguments**

    - ``url`` -- the url to add the credentials to
    - ``credentials`` -- dictionary with ``username`` and ``password`` keys

    **Return**

    - ``url`` -- the url with ``username:password@`` inserted
    """
    url_pass = f'{credentials["username"]}:{credentials["password"]}@'
    if "://" in url:
        # ONLY THE FIRST "://" IS THE SCHEME SEPARATOR - A LATER ONE (E.G. A URL
        # CARRIED IN THE QUERY STRING) MUST BE LEFT UNTOUCHED
        return url.replace("://", "://" + url_pass, 1)
    return url_pass + url


def multiobject_download(
    urlList,
    downloadDirectory,
    log,
    timeStamp=True,
    timeout=180,
    concurrentDownloads=10,
    resetFilename=False,
    credentials=False,
    longTime=False,
    indexFilenames=False,
):
    """
    *get multiple url documents and place them in specified download directory/directories*

    **Key Arguments**

    - ``urlList`` -- list of document urls
    - ``downloadDirectory`` -- directory(ies) to download the documents to - can be one directory path or a list of paths the same length as urlList
    - ``log`` -- the logger
    - ``timeStamp`` -- append a timestamp the name of the URL (ensure unique filenames)
    - ``longTime`` -- use a longer timestamp when appending to the filename (greater uniqueness)
    - ``timeout`` -- the timeout limit for downloads (secs)
    - ``concurrentDownloads`` -- the number of concurrent downloads allowed at any one time
    - ``resetFilename`` -- a string to reset all filenames to, or a list of filenames (one per url, same order as urlList)
    - ``credentials`` -- basic http credentials { 'username' : "...", 'password' : "..." }
    - ``indexFilenames`` -- prepend filenames with index (where url appears in urllist). Ignored when ``resetFilename`` is given.

    **Return**

    - list of local file paths of the documents that were downloaded successfully (same order as the input urlList; failed downloads are omitted)

    **Usage**

    ```python
    # download the pages linked from the main list page
    from fundamentals.download import multiobject_download
    localUrls = multiobject_download(
        urlList=["https://www.python.org/dev/peps/pep-0257/","https://en.wikipedia.org/wiki/Docstring"],
        downloadDirectory="/tmp",
        log=log,
        timeStamp=True,
        timeout=180,
        concurrentDownloads=2,
        resetFilename=False,
        credentials=False,  # { 'username' : "...", 'password' : "..." }
        longTime=True
    )

    print(localUrls)
    # OUT: ['/tmp/untitled_20160316t160650610780.html',
    # '/tmp/Docstring_20160316t160650611136.html']
    ```

    .. image:: https://i.imgur.com/QYoMm24.png width=600px
    """
    import sys
    from datetime import datetime

    from fundamentals.download import (
        append_now_datestamp_to_filename,
        extract_filename_from_url,
    )

    # fetch_url RUNS IN THE WORKER THREADS AND READS THE TIMEOUT (SECONDS) AND
    # LOGGER FROM THESE MODULE GLOBALS
    global gtimeout
    global llog
    llog = log
    gtimeout = float(timeout)

    # ONE DIRECTORY SHARED BY EVERY URL, OR ONE DIRECTORY PER URL
    if isinstance(downloadDirectory, list):
        directories = downloadDirectory
    else:
        directories = [downloadDirectory] * len(urlList)

    # BUILD THE [URL, LOCAL PATH] PAIRS FOR THE MULTI-THREADED DOWNLOADS
    thisArray = []
    for i, (url, directory) in enumerate(zip(urlList, directories)):
        if resetFilename:
            # A SINGLE STRING RENAMES EVERY FILE, A LIST GIVES ONE NAME PER URL
            if isinstance(resetFilename, str):
                filename = resetFilename
            else:
                filename = resetFilename[i]
        else:
            # EXTRACT THE FILENAME FROM THE URL
            filename = extract_filename_from_url(log, url)

        if not filename:
            # NOTHING USABLE IN THE URL - FALL BACK TO A TIMESTAMP FOR A NAME
            filename = datetime.now().strftime("%Y%m%dt%H%M%S%f")

        if indexFilenames and not resetFilename:
            filename = f"{i:03d}_{filename}"

        if timeStamp:
            # APPEND TIMESTAMP TO THE FILENAME
            filename = append_now_datestamp_to_filename(
                log, filename, longTime=longTime
            )

        # GENERATE THE LOCAL FILE URL
        localFilepath = directory + "/" + filename

        # ADD BASIC AUTH TO THE URL *BEFORE* IT IS QUEUED FOR DOWNLOAD
        if credentials:
            url = _add_basic_auth(url, credentials)

        thisArray.append([url, localFilepath])

    if not thisArray:
        # NOTHING TO DOWNLOAD - DO NOT SPIN UP A THREAD POOL FOR NO WORK
        return []

    totalCount = len(thisArray)
    returnPaths = set()

    # CONCURRENTLY DOWNLOAD URLS. THE CONTEXT MANAGER TEARS THE WORKER THREADS
    # DOWN ONCE EVERY RESULT HAS BEEN CONSUMED
    with ThreadPool(concurrentDownloads) as pool:
        for urlNum, path in enumerate(pool.imap_unordered(fetch_url, thisArray), 1):
            returnPaths.add(path)
            if urlNum > 1:
                # CURSOR UP ONE LINE AND CLEAR LINE
                sys.stdout.write("\x1b[1A\x1b[2K")
            percent = (float(urlNum) / float(totalCount)) * 100.0
            print(
                " %(urlNum)s / %(totalCount)s (%(percent)1.1f%%) URLs downloaded"
                % {"urlNum": urlNum, "totalCount": totalCount, "percent": percent}
            )

    # RESULTS ARRIVE IN COMPLETION ORDER - REPORT THEM IN INPUT ORDER.
    # FAILED DOWNLOADS RETURN None AND SO NEVER MATCH A LOCAL PATH
    return [localPath for _, localPath in thisArray if localPath in returnPaths]
def fetch_url(entry):
    """
    *download a single url to a local file, retrying on failure*

    The request timeout starts at the module-level ``gtimeout`` and is doubled after every failed request. Warnings are written to the module-level ``llog`` logger.

    **Key Arguments**

    - ``entry`` -- a ``[url, localFilepath]`` pair

    **Return**

    - ``path`` -- the local filepath if the download succeeded, ``None`` if all attempts failed
    """
    tries = 5
    uri, path = entry
    timeout = gtimeout

    # RANDOM 0.05-5.05s PAUSE SO CONCURRENT WORKERS DO NOT ALL FIRE THEIR
    # FIRST REQUEST AT THE SAME INSTANT
    time.sleep(random.randint(1, 101) / 20.0)

    for count in range(1, tries + 1):
        partial = False
        try:
            # THE CONTEXT MANAGER RELEASES THE STREAMED CONNECTION ON EVERY EXIT PATH
            with requests.get(uri, stream=True, timeout=timeout) as r:
                status = r.status_code
                if status == 200:
                    partial = True
                    with open(path, "wb") as f:
                        for chunk in r.iter_content(chunk_size=65536):
                            f.write(chunk)
                    return path
        except requests.exceptions.RequestException as e:
            # DO NOT LEAVE A TRUNCATED FILE BEHIND IF THE STREAM DIED MID-BODY.
            # ONLY REMOVE A FILE THIS ATTEMPT ACTUALLY STARTED WRITING
            if partial and os.path.exists(path):
                os.remove(path)
            timeout *= 2
            llog.warning(
                f"request failed on attempt number {count}/{tries} ({e}). Increasing timeout to {timeout}s"
            )
            continue

        llog.warning(
            f"Getting status code {status} on download attempt {count}/{tries}."
        )

    return None