Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Issue1105 #169

Closed
wants to merge 6 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
107 changes: 107 additions & 0 deletions experiments/issue1105/archive.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
from pathlib import Path
import subprocess
import tarfile
from tempfile import TemporaryDirectory

ARCHIVE_HOST = "aifiles"
ARCHIVE_LOCATION = Path("experiments")

def add_archive_step(exp, path):
    """
    Registers a step named "archive" on the given experiment. Running the
    step copies the experiment to ARCHIVE_HOST, below the directory
    ARCHIVE_LOCATION / path.

    The following files are archived:
    - everything in the same directory as the main experiment script
      (except for 'data', '.venv', 'venv', and '__pycache__')
    - all generated reports
    - the combined properties file
    - all run and error logs
    - the source code stored in the experiment data directory
    - any files added as resources to the experiment

    The first two items in the above list are stored unpacked for easier
    access, while all other data is packed.
    """
    def archive():
        # Resolved when the step runs, not when it is registered.
        destination = ARCHIVE_LOCATION / path
        archivers = (_archive_script_dir, _archive_eval_dir, _archive_data_dir)
        for archive_part in archivers:
            archive_part(exp, ARCHIVE_HOST, destination)

    exp.add_step("archive", archive)


def _archive_script_dir(exp, host, archive_path):
    """
    Copies the contents of the directory that holds the experiment script
    to host:archive_path/scripts. Entries named 'data', '.venv', 'venv',
    or '__pycache__' are left out.
    """
    excluded_names = ["data", ".venv", "venv", "__pycache__"]
    source_dir = Path(exp._script).parent

    entries_to_copy = []
    for entry in source_dir.glob("*"):
        if entry.name in excluded_names:
            continue
        entries_to_copy.append(entry)

    _rsync(entries_to_copy, host, archive_path / "scripts")


def _archive_data_dir(exp, host, archive_path):
    """
    Collects the files worth keeping from the experiment's data directory,
    packs them into a single <exp.name>.tar.xz and copies that archive to
    host:archive_path/data. The packed files are:
    - all regular files directly in the data dir
      (added resources such as parsers)
    - all directories matching "code-*" (source code of all revisions and
      the compiled binaries)
    - all *.log and *.err files matching runs*/*/ in the data dir

    Paths inside the archive are relative to the parent of the data dir.
    """
    exp_dir = Path(exp.path)

    selected = []
    for entry in exp_dir.glob("*"):
        if entry.is_file():
            selected.append(entry)
    for entry in exp_dir.glob("code-*"):
        if entry.is_dir():
            selected.append(entry)
    for pattern in ("runs*/*/*.log", "runs*/*/*.err"):
        selected.extend(exp_dir.glob(pattern))

    with TemporaryDirectory() as scratch_dir:
        tarball = Path(scratch_dir) / (exp.name + ".tar.xz")
        _pack(selected, tarball, exp_dir.parent)
        # Copy while the temporary directory (and thus the tarball) exists.
        _rsync([tarball], host, archive_path / "data")


def _archive_eval_dir(exp, host, archive_path):
    """
    Copies the contents of the experiment's eval dir to
    host:archive_path/data/<name of the eval dir>.
    If the eval dir contains a 'properties' file, that file is packed into
    properties.tar.xz and only the packed version is copied; all other
    entries are copied as they are.
    """
    eval_dir = Path(exp.eval_dir)
    destination = archive_path / "data" / eval_dir.name
    properties_file = eval_dir / "properties"

    unpacked_entries = list(eval_dir.glob("*"))
    if properties_file.exists():
        # The raw properties file is replaced by its packed counterpart.
        unpacked_entries.remove(properties_file)
        with TemporaryDirectory() as scratch_dir:
            tarball = Path(scratch_dir) / "properties.tar.xz"
            _pack([properties_file], tarball, eval_dir)
            _rsync([tarball], host, destination)

    _rsync(unpacked_entries, host, destination)


def _pack(filenames, archive_filename, path_prefix):
    """
    Packs all files given in filenames into an archive (.tar.xz) located at
    archive_filename. The path_prefix is removed in the archive, i.e.,
    if the filename is '/path/to/file' and the prefix is '/path', the location
    inside the archive will be 'to/file'. Directories are added recursively.

    The entries of filenames and path_prefix may be strings or path-like
    objects. Raises ValueError if one of the filenames is not located below
    path_prefix.
    """
    with tarfile.open(archive_filename, "w|xz") as archive:
        for name in filenames:
            # Wrapping in Path lets callers pass plain strings as well.
            member_name = Path(name).relative_to(path_prefix)
            # tarfile documents name/arcname as strings; always use forward
            # slashes for the member name inside the archive.
            archive.add(str(name), arcname=member_name.as_posix())

def _rsync(filenames, host, target_path):
    """
    Copies the given files and directories to host:target_path with rsync,
    creating target_path on the host first.

    Does nothing if filenames is empty. Raises subprocess.CalledProcessError
    if creating the remote directory or the transfer itself fails, so that a
    failed archive attempt does not go unnoticed.
    """
    sources = [str(f) for f in filenames]
    if not sources:
        # With the remote location as its only argument, rsync would just
        # list the remote directory instead of copying anything.
        return

    # Before copying files we have to create the target path on host.
    # We could use the rsync option --mkpath but it is only available in newer
    # rsync versions (and not in the one running on the grid)
    # https://stackoverflow.com/questions/1636889
    subprocess.run(["ssh", host, "mkdir", "-p", str(target_path)], check=True)
    subprocess.run(
        ["rsync", "-avz", *sources, f"{host}:{target_path}"], check=True)
Loading
Loading