From 40fe75340942e438133b7facdd4a5bfea2806e31 Mon Sep 17 00:00:00 2001
From: Christoph Gerum
Date: Thu, 23 Nov 2023 13:56:42 +0000
Subject: [PATCH] Add documentation for experiment management

---
 doc/experiments.md                           | 131 ++++++++++++++++++
 experiments/cifar10/experiment/sweep_lr.yaml |   9 ++
 .../cifar10/experiment/sweep_models.yaml     |   9 ++
 hannah/models/factory/factory.py             |   2 -
 hannah/modules/vision/image_classifier.py    |   2 -
 pydoc-markdown.yml                           |   3 +
 6 files changed, 152 insertions(+), 4 deletions(-)
 create mode 100644 doc/experiments.md
 create mode 100644 experiments/cifar10/experiment/sweep_lr.yaml
 create mode 100644 experiments/cifar10/experiment/sweep_models.yaml

diff --git a/doc/experiments.md b/doc/experiments.md
new file mode 100644
index 00000000..29e01abf
--- /dev/null
+++ b/doc/experiments.md
@@ -0,0 +1,131 @@
+# Experiment Management
+
+It is common to create a new directory for each group of experiments; usually these are grouped around
+a specific publication goal or project.
+
+Hydra configuration options are taken from the following sources:
+
+1. A local `config.yaml` taken from the directory in which the command is run.
+2. Configuration group overrides from any subdirectory of the current working directory named after a configuration group.
+3. Overrides from a special configuration group, usually called `experiment`.
+
+For an example of experiment management, have a look at `experiments/cifar10`.
+
+It has the following directory structure:
+
+```
+.
+├── augmentation
+│   └── cifar_augment.yaml
+├── config.yaml
+├── datasets
+│   └── ...
+├── experiment
+│   ├── sweep_lr.yaml
+│   └── sweep_models.yaml
+├── scripts
+│   └── train_slurm.sh
+└── trained_models
+    ├── sweep_lr
+    │   ├── 0.0001
+    │   ├── ...
+    └── sweep_models
+        ├── multirun.yaml
+        └── resnet18
+```
+
+The main configuration is found in `config.yaml`:
+
+```yaml
+defaults:
+  - base_config
+  - override dataset: cifar10          # Dataset configuration name
+  - override features: identity        # Feature extractor configuration name (use identity for vision datasets)
+  - override model: timm_resnet18      # Neural network name (e.g. timm_mobilenetv3_small_100; for now timm_resnet50 or timm_efficientnet_lite1)
+  - override scheduler: 1cycle         # Learning rate scheduler configuration name
+  - override optimizer: sgd            # Optimizer configuration name
+  - override normalizer: null          # Feature normalizer (used for quantized neural networks)
+  - override module: image_classifier  # Lightning module configuration for the training loop (image classifier for image classification tasks)
+  - override augmentation: cifar_augment
+  - _self_

+monitor:
+  metric: val_f1_micro
+  direction: maximize
+
+module:
+  batch_size: 64
+
+trainer:
+  max_epochs: 50
+
+scheduler:
+  max_lr: 0.1
+```
+
+The configuration is composed from the standard presets in `base_config`; the defaults are then replaced
+by different configuration group presets using the `- override <group>: <preset>` syntax. Most of the
+presets are taken from the package-wide configuration in `hannah/conf`. The configuration for data
+augmentation is defined in `augmentation/cifar_augment.yaml`:
+
+```yaml
+batch_augment:
+  pipeline: null
+  transforms:
+    RandomVerticalFlip:
+      p: 0.5
+    RandomCrop:
+      size: [32, 32]
+      padding: 4
+```
+
+This file specifies the data augmentation presets shared among the experiments in this group.
+
+The experiments themselves are then defined in, for example, `experiment/sweep_lr.yaml` and
+`experiment/sweep_models.yaml`. These experiments can be started with `hannah-train +experiment=<experiment_name>`.
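+
+For example, the sweeps defined in this directory should be launchable as follows (this assumes hannah
+is installed and the commands are run from `experiments/cifar10`, so that the local `config.yaml` is
+picked up):
+
+```bash
+# Run from the experiments/cifar10 directory so the local config.yaml is found.
+# Launch the learning rate sweep defined in experiment/sweep_lr.yaml:
+hannah-train +experiment=sweep_lr
+
+# Launch the model sweep defined in experiment/sweep_models.yaml:
+hannah-train +experiment=sweep_models
+```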
+Please note the **+** in front of the command-line argument. It is needed because `experiment` is not
+part of the default configuration hierarchy.
+
+As an example, have a look at `experiment/sweep_lr.yaml`:
+
+```yaml
+# @package _global_
+experiment_id: sweep_lr
+hydra:
+  mode: MULTIRUN
+  sweep:
+    subdir: lr=${scheduler.max_lr}
+  sweeper:
+    params:
+      scheduler.max_lr: 0.0001,0.001,0.01,0.1
+```
+
+Experiment configurations must start with `# @package _global_`. This means that the overrides defined
+here change the global configuration, not values below an `experiment` key. The next line sets the
+`experiment_id`, which is used to identify the experiment and to select a separate subfolder for the
+output artifacts (normally `trained_models/<experiment_id>`).
+
+The final part of the configuration then sets up a sweep over multiple parameters. In this case we
+sweep over the `max_lr` parameter of the 1cycle learning rate scheduler and configure the output
+directory to contain a separate subdirectory (`lr=0.0001`, `lr=0.001`, ...) for each setting of `max_lr`.
+
+The final outputs can then be found in `trained_models/sweep_lr/lr\=0.0001/` and so on.
diff --git a/experiments/cifar10/experiment/sweep_lr.yaml b/experiments/cifar10/experiment/sweep_lr.yaml
new file mode 100644
index 00000000..d3424177
--- /dev/null
+++ b/experiments/cifar10/experiment/sweep_lr.yaml
@@ -0,0 +1,9 @@
+# @package _global_
+experiment_id: sweep_lr
+hydra:
+  mode: MULTIRUN
+  sweep:
+    subdir: lr=${scheduler.max_lr}
+  sweeper:
+    params:
+      scheduler.max_lr: 0.0001,0.001,0.01,0.1
diff --git a/experiments/cifar10/experiment/sweep_models.yaml b/experiments/cifar10/experiment/sweep_models.yaml
new file mode 100644
index 00000000..2e4d23e1
--- /dev/null
+++ b/experiments/cifar10/experiment/sweep_models.yaml
@@ -0,0 +1,9 @@
+# @package _global_
+experiment_id: sweep_models
+hydra:
+  mode: MULTIRUN
+  sweep:
+    subdir: ${model.name}
+  sweeper:
+    params:
+      model: timm_resnet18,timm_mobilenetv3_small_075,timm_mobilenetv3_small_100
diff --git a/hannah/models/factory/factory.py b/hannah/models/factory/factory.py
index 2dd23b97..f55002f4 100644
--- a/hannah/models/factory/factory.py
+++ b/hannah/models/factory/factory.py
@@ -1297,6 +1297,4 @@ def create_cnn(
     )
     output_shape, model = factory.network(input_shape, labels, structured_config)
 
-    print(model)
-
     return model
diff --git a/hannah/modules/vision/image_classifier.py b/hannah/modules/vision/image_classifier.py
index 0deccb4e..f42b5f2c 100644
--- a/hannah/modules/vision/image_classifier.py
+++ b/hannah/modules/vision/image_classifier.py
@@ -156,8 +156,6 @@ def on_validation_batch_end(self, outputs, batch, batch_idx, dataloader_idx=0):
 
     def on_validation_epoch_end(self):
         super().on_validation_epoch_end()
 
-        print(self.validation_res_df.head())
-
     def test_step(self, batch, batch_idx):
         _, step_results, batch, preds = self.common_step("test", batch, batch_idx)
diff --git a/pydoc-markdown.yml b/pydoc-markdown.yml
index 752bd69f..44513abe 100644
--- a/pydoc-markdown.yml
+++ b/pydoc-markdown.yml
@@ -41,6 +41,9 @@ renderer:
       - title: Publications
         name: publications
         source: doc/publications.md
+      - title: Experiment Management
+        name: experiments
+        source: doc/experiments.md
       - title: Configuration
         name: configuration
         children: