Milan Verespej | 6e372ca | 2019-07-08 12:42:30 +0200 | [diff] [blame] | 1 | #! /usr/bin/env python3 |
Milan Verespej | 2e1328a | 2019-06-18 13:40:08 +0200 | [diff] [blame] | 2 | # -*- coding: utf-8 -*- |
| 3 | |
| 4 | # COPYRIGHT NOTICE STARTS HERE |
| 5 | |
| 6 | # Copyright 2019 © Samsung Electronics Co., Ltd. |
| 7 | # |
| 8 | # Licensed under the Apache License, Version 2.0 (the "License"); |
| 9 | # you may not use this file except in compliance with the License. |
| 10 | # You may obtain a copy of the License at |
| 11 | # |
| 12 | # http://www.apache.org/licenses/LICENSE-2.0 |
| 13 | # |
| 14 | # Unless required by applicable law or agreed to in writing, software |
| 15 | # distributed under the License is distributed on an "AS IS" BASIS, |
| 16 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 17 | # See the License for the specific language governing permissions and |
| 18 | # limitations under the License. |
| 19 | |
| 20 | # COPYRIGHT NOTICE ENDS HERE |
| 21 | |
| 22 | import argparse |
| 23 | import datetime |
| 24 | import logging |
| 25 | import os |
| 26 | import sys |
| 27 | import timeit |
| 28 | |
| 29 | import requests |
| 30 | from retrying import retry |
| 31 | |
| 32 | import http_file |
| 33 | from concurrent_downloader import ConcurrentDownloader |
| 34 | |
| 35 | log = logging.getLogger(__name__) |
| 36 | |
| 37 | |
| 38 | class HttpDownloader(ConcurrentDownloader): |
Milan Verespej | 4c7e892 | 2019-06-18 13:46:48 +0200 | [diff] [blame] | 39 | def __init__(self, *list_args, list_type='http_files', workers=None): |
| 40 | super().__init__(list_type, *list_args, workers=workers) |
Milan Verespej | 2e1328a | 2019-06-18 13:40:08 +0200 | [diff] [blame] | 41 | |
| 42 | @property |
| 43 | def check_table(self): |
| 44 | """ |
| 45 | Table with information what items from lists are downloaded |
| 46 | """ |
| 47 | self.missing() |
| 48 | header = ['Name', 'Downloaded'] |
| 49 | return self._check_table(header, {'Name': 'l'}, |
| 50 | ((item, item not in self._missing) for item |
| 51 | in self._data_list)) |
| 52 | |
| 53 | @staticmethod |
| 54 | def _make_get_request(url): |
| 55 | """ |
| 56 | Run http get request |
| 57 | :param url: url to reqeuest |
| 58 | :return: requests.Response |
| 59 | """ |
| 60 | req = requests.get(url) |
| 61 | req.raise_for_status() |
| 62 | return req |
| 63 | |
| 64 | def _is_missing(self, item): |
| 65 | """ |
| 66 | Check if item is missing (not downloaded) |
| 67 | :param item: item to check |
| 68 | :return: boolean |
| 69 | """ |
| 70 | return not os.path.isfile( |
| 71 | '{}/{}'.format(self._data_list[item], item.rsplit('//')[-1])) |
| 72 | |
| 73 | @retry(stop_max_attempt_number=5, wait_fixed=2000) |
| 74 | def _get_file(self, file_uri): |
| 75 | """ |
| 76 | Get http file from uri |
| 77 | :param file_uri: uri of the file |
| 78 | :return: file content |
| 79 | """ |
| 80 | if not file_uri.startswith('http'): |
| 81 | file_uri = 'http://' + file_uri |
| 82 | file_req = self._make_get_request(file_uri) |
| 83 | return file_req.content |
| 84 | |
| 85 | def _download_item(self, item): |
| 86 | """ |
| 87 | Download http file |
| 88 | :param item: http file to be downloaded (tuple: (uri, dst_dir)) |
| 89 | """ |
| 90 | log.info('Downloading: {}'.format(item[0])) |
| 91 | dst_path = '{}/{}'.format(item[1], item[0].rsplit('//')[-1]) |
| 92 | try: |
| 93 | f = http_file.HttpFile(item[0], self._get_file(item[0]), dst_path) |
| 94 | f.save_to_file() |
| 95 | except Exception as err: |
| 96 | log.exception('Error downloading: {}: {}'.format(item[0], err)) |
| 97 | if os.path.isfile(dst_path): |
| 98 | os.remove(dst_path) |
| 99 | raise err |
| 100 | log.info('Downloaded: {}'.format(f.name)) |
| 101 | |
| 102 | |
def run_cli():
    """
    Command-line entry point: parse arguments, configure logging and
    either report download status (--check) or download the listed files.
    """
    cli_parser = argparse.ArgumentParser(description='Download http files from list')
    cli_parser.add_argument('file_list', metavar='file-list',
                            help='File with list of http files to download')
    cli_parser.add_argument('--output-dir', '-o', default=os.getcwd(),
                            help='Destination directory for saving')
    cli_parser.add_argument('--check', '-c', action='store_true', default=False,
                            help='Check mode')
    cli_parser.add_argument('--debug', action='store_true', default=False,
                            help='Turn on debug output')
    cli_parser.add_argument('--workers', type=int, default=None,
                            help='Set maximum workers for parallel download (default: cores * 5)')

    options = cli_parser.parse_args()

    # Debug mode keeps the default record format; normal mode prints
    # bare messages only.
    log_config = {'stream': sys.stdout}
    if options.debug:
        log_config['level'] = logging.DEBUG
    else:
        log_config['level'] = logging.INFO
        log_config['format'] = '%(message)s'
    logging.basicConfig(**log_config)

    http_downloader = HttpDownloader([options.file_list, options.output_dir],
                                     workers=options.workers)

    if options.check:
        log.info('Check mode. No download will be executed.')
        log.info(http_downloader.check_table)
        sys.exit(0)

    started_at = timeit.default_timer()
    try:
        http_downloader.download()
    except RuntimeError:
        sys.exit(1)
    finally:
        # Report elapsed time even when the download failed.
        log.info('Downloading finished in {}'.format(
            datetime.timedelta(seconds=timeit.default_timer() - started_at)))
| 141 | |
| 142 | |
# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == '__main__':
    run_cli()