-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathDownloadGitHubFiles.py
128 lines (112 loc) · 6.87 KB
/
DownloadGitHubFiles.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
import time
import os
import requests
import io
from collections import namedtuple
from IPython.display import clear_output
from enum import Enum
class DownloadGitHubFiles:
def __init__(self, url_list_filepath, is_testing=False):
self.url_list_filepath = url_list_filepath
self.is_testing = is_testing
self.error_messages = io.StringIO()
self.processed_result_counts = [0, 0, 0]
# Get GitHub token from environment variable.
self.github_token = os.environ.get('GITHUB_TOKEN')
if self.github_token is None:
raise Exception('Environment variable GITHUB_TOKEN must be set to a GitHub personal access token. See https://docs.github.com/en/github/authenticating-to-github/creating-a-personal-access-token.')
def print_status(self, current_status):
clear_output()
print(current_status)
print(f'{self.processed_result_counts[self._process_file_result.SUCCEEDED.value]} files downloaded, {self.processed_result_counts[self._process_file_result.FAILED.value]} failed, {self.processed_result_counts[self._process_file_result.SKIPPED.value]} skipped (already cached)')
if self.error_messages.getvalue():
print('\nErrors:')
print(self.error_messages.getvalue(), end='')
def download(self):
with open(self.url_list_filepath, 'r') as f:
for line in f:
file_url = line.strip()
self.print_status(f'Downloading {file_url}')
processed_result = self._process_file(file_url)
self.processed_result_counts[processed_result.value] += 1
if processed_result == self._process_file_result.SKIPPED:
continue
# Don't do lots of downloading in testing mode
max_testing_files = 2
if self.is_testing and self.processed_result_counts[self._process_file_result.SUCCEEDED.value] + self.processed_result_counts[self._process_file_result.FAILED.value] >= max_testing_files:
print(f'Limiting to {max_testing_files} file downloads in testing mode')
break
# Avoid hitting GitHub with too many rapid-fire requests.
# It seems that because the downloads (https://raw.githubusercontent.com) are not part of the GitHub API URL, the rate limit headers (x-ratelimit*) are not included.
# According to https://stackoverflow.com/questions/66522261/does-github-rate-limit-access-to-public-raw-files, there may be a rate limit of 5000 requests per hour.
sleep_time = .1 if self.is_testing else 60 * 60 / 5000 * 3 # quick results when testing, 3x the rate limit when downloading all the data
time.sleep(sleep_time)
self.print_status('Done')
class _process_file_result(Enum):
SUCCEEDED = 0
FAILED = 1
SKIPPED = 2
def _process_file(self, github_file_url):
parsed_url = self._parse_url(github_file_url)
local_filepath = self._get_local_filepath(parsed_url)
if os.path.exists(local_filepath):
return self._process_file_result.SKIPPED
download_url = self._get_download_url(parsed_url)
if self._download(download_url, local_filepath):
return self._process_file_result.SUCCEEDED
else:
return self._process_file_result.FAILED
ParsedUrl = namedtuple('ParsedUrl', ['owner', 'repo', 'commit_hash', 'filepath'])
def _parse_url(self, github_file_url):
# Example input: https://github.com/AIT-IES/FMITerminalBlock/blob/143403d06934acba5e434d7cb5a9158545e95f13/doc/user/tutorial-data/model/Sisyphus.fmu
# owner = AIT-IES
# repo = FMITerminalBlock
# commit hash = 143403d06934acba5e434d7cb5a9158545e95f13
# filepath = doc/user/tutorial-data/model/Sisyphus.fmu
split = github_file_url.split('/')
result = self.ParsedUrl(
owner = split[3],
repo = split[4],
commit_hash = split[6],
filepath = '/'.join(split[7:]),
)
assert(split[5] == 'blob')
return result
def _get_contents_info_url(self, parsed_url):
# Convert the GitHub URL to a URL that provides metadata about the file
# Example input: https://github.com/AIT-IES/FMITerminalBlock/blob/143403d06934acba5e434d7cb5a9158545e95f13/doc/user/tutorial-data/model/Sisyphus.fmu
# Example output: https://api.github.com/repos/AIT-IES/FMITerminalBlock/contents/doc/user/tutorial-data/model/Sisyphus.fmu?ref=143403d06934acba5e434d7cb5a9158545e95f13
result = f'https://api.github.com/repos/{parsed_url.owner}/{parsed_url.repo}/contents/{parsed_url.filepath}?ref={parsed_url.commit_hash}'
return result
def _get_download_url(self, parsed_url):
# Convert the GitHub URL to a download URL
# Example input: https://github.com/AIT-IES/FMITerminalBlock/blob/143403d06934acba5e434d7cb5a9158545e95f13/doc/user/tutorial-data/model/Sisyphus.fmu
# Example output: https://raw.githubusercontent.com/AIT-IES/FMITerminalBlock/143403d06934acba5e434d7cb5a9158545e95f13/doc/user/tutorial-data/model/Sisyphus.fmu
result = f'https://raw.githubusercontent.com/{parsed_url.owner}/{parsed_url.repo}/{parsed_url.commit_hash}/{parsed_url.filepath}'
return result
def _get_local_filepath(self, parsed_url):
# Convert the GitHub URL to a local file path
# Example input: https://github.com/AIT-IES/FMITerminalBlock/blob/143403d06934acba5e434d7cb5a9158545e95f13/doc/user/tutorial-data/model/Sisyphus.fmu
# Example output if url_list_filepath is "results/urls.txt": results/downloads/AIT-IES/FMITerminalBlock/doc_user_tutorial-data_model_Sisyphus.fmu
url_list_directory = os.path.dirname(self.url_list_filepath)
filepath_with_underscores = '_'.join(parsed_url.filepath.split('/'))
result = f'{url_list_directory}/downloads/{parsed_url.owner}/{parsed_url.repo}/{filepath_with_underscores}'
return result
def _download(self, download_url, local_filepath):
headers = {'Authorization': f'token {self.github_token}'}
response = requests.get(download_url, headers=headers, stream=True)
if response.status_code == 200:
# Stream the content of the response to a local file
# Create directory of local file, if needed
os.makedirs(os.path.dirname(local_filepath), exist_ok=True)
with open(local_filepath, 'wb') as file:
for chunk in response.iter_content(chunk_size=8192):
file.write(chunk)
return True
else:
# Print the error message if the request failed
error_message = f'{download_url}: {response.status_code} - {response.text}'
print(error_message)
self.error_messages.write(error_message)
self.error_messages.write('\n')
return False