forked from simonw/museums
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathannotate_timestamps.py
55 lines (49 loc) · 1.88 KB
/
annotate_timestamps.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
import git
import yaml
import json
import sqlite_utils
from sqlite_utils.db import NotFoundError
# Commit hashes whose changes are skipped when working out "updated" timestamps.
IGNORE_CHANGES_IN_COMMITS = {
    # Rewrote every existing press date into a new format in one go
    "78fa0ac54dcaa9c52e8962a44b574b082bc726d3",
}
def iterate_file_versions(repo_path, filepath, ref="main"):
    """Yield every committed version of a file, oldest commit first.

    Args:
        repo_path: Path to the Git repository to open.
        filepath: Path of the file relative to the repository root. It may
            point into a subdirectory.
        ref: Branch, tag or other revision whose history is walked.

    Yields:
        ``(committed_datetime, hexsha, content)`` tuples, where ``content``
        is the raw bytes of the file as stored in that commit. Commits in
        which the file does not exist (for example the commit that deleted
        it) are skipped.
    """
    repo = git.Repo(repo_path, odbt=git.GitDB)
    # The commit list is reversed so history is replayed in order.
    commits = reversed(list(repo.iter_commits(ref, paths=filepath)))
    for commit in commits:
        try:
            # Resolve the full path through the tree so that files inside
            # subdirectories are found, not just entries of the root tree.
            blob = commit.tree.join(filepath)
        except KeyError:
            # The path filter also matches commits that removed the file;
            # there is no content to yield for those.
            continue
        yield commit.committed_datetime, commit.hexsha, blob.data_stream.read()
if __name__ == "__main__":
    # Replay each committed version of museums.yaml to work out when every
    # museum first appeared and when it last changed, then store both
    # timestamps on the matching rows of the "museums" table in browse.db.
    ref = "main"
    versions = iterate_file_versions(".", "museums.yaml", ref)
    previous = {}
    created = {}
    updated = {}
    for when, commit_sha, content in versions:
        try:
            records = yaml.safe_load(content)
        except yaml.YAMLError:
            # This must have been invalid YAML - skip
            continue
        if not isinstance(records, list):
            # E.g. an empty file parses to None; there are no museum
            # records to compare, so treat it like an unparseable version.
            continue
        current = {m["id"]: m for m in records}
        timestamp = when.isoformat()
        # First detect the new museums
        for museum_id in current:
            if museum_id not in previous:
                created[museum_id] = timestamp
                updated[museum_id] = timestamp
        # Now detect those that have changed since the previous version
        if commit_sha not in IGNORE_CHANGES_IN_COMMITS:
            for museum_id, museum in current.items():
                # Compare canonical JSON dumps; default=str stringifies any
                # values json cannot serialize natively.
                before = json.dumps(
                    previous.get(museum_id, {}), sort_keys=True, default=str
                )
                after = json.dumps(museum, sort_keys=True, default=str)
                if after != before:
                    updated[museum_id] = timestamp
        # Always advance the baseline, even for ignored commits, so their
        # changes are not attributed to the following commit.
        previous = current
    db = sqlite_utils.Database("browse.db")
    for museum_id, created_at in created.items():
        try:
            db["museums"].update(
                museum_id,
                {"created": created_at, "updated": updated[museum_id]},
                alter=True,
            )
        except NotFoundError:
            # No row with this id in the museums table - nothing to annotate
            pass