diff --git a/configs/det/efl/efl_improved_baseline_r101_2x_rfs.yaml b/configs/det/efl/efl_improved_baseline_r101_2x_rfs.yaml new file mode 100644 index 00000000..2c20061d --- /dev/null +++ b/configs/det/efl/efl_improved_baseline_r101_2x_rfs.yaml @@ -0,0 +1,208 @@ +num_classes: &num_classes 1204 + +flip: &flip + type: flip + kwargs: + flip_p: 0.5 + +resize: &train_resize + type: keep_ar_resize + kwargs: + scales: [640, 672, 704, 736, 768, 800] + max_size: 1333 + separate_wh: True + +resize: &test_resize + type: keep_ar_resize + kwargs: + scales: [800] + max_size: 1333 + separate_wh: True + + +to_tensor: &to_tensor + type: to_tensor + +normalize: &normalize + type: normalize + kwargs: + mean: [0.485, 0.456, 0.406] # ImageNet pretrained statics + std: [0.229, 0.224, 0.225] + +dataset: # Required. + train: + dataset: + type: lvisv1 + kwargs: + meta_file: lvis/annotations/lvis_v1_train.json + image_reader: + type: fs_opencv + kwargs: + image_dir: lvis + color_mode: RGB + transformer: [*flip, *train_resize, *to_tensor, *normalize] + batch_sampler: + type: aspect_ratio_group + kwargs: + sampler: + type: repeat_factor + kwargs: + t: 0.001 + ri_mode: ceil + pn: 0.5 + static_size: False + batch_size: 1 + aspect_grouping: [1,] + test: + dataset: + type: lvisv1 + kwargs: + meta_file: >_file lvis/annotations/lvis_v1_val.json + image_reader: + type: fs_opencv + kwargs: + image_dir: lvis + color_mode: RGB + transformer: [*test_resize, *to_tensor, *normalize] + evaluator: + type: LVIS # choices = {'COCO', 'VOC', 'MR'} + kwargs: + gt_file: *gt_file + iou_types: [bbox] + batch_sampler: + type: aspect_ratio_group + kwargs: + sampler: + type: dist + kwargs: {} + batch_size: 1 + aspect_grouping: [1,] + dataloader: + type: base + kwargs: + num_workers: 4 + alignment: 64 + + +trainer: # Required. + max_epoch: 24 # total epochs for the training + test_freq: 24 + optimizer: # optimizer = SGD(params,lr=0.001,momentum=0.9,weight_decay=0.0001) + type: SGD + kwargs: + lr: 0.00125 + momentum: 0.9 + weight_decay: 0.0001 + lr_scheduler: # lr_scheduler = MultStepLR(optimizer, milestones=[9,14],gamma=0.1) + warmup_epochs: 1 # set to be 0 to disable warmup. When warmup, target_lr = init_lr * total_batch_size + warmup_type: linear + warmup_ratio: 0.001 # warmup init lr = warmup_ratio * base_lr + type: MultiStepLR + kwargs: + milestones: [16, 22] # epochs to decay lr + gamma: 0.1 # decay rate + +saver: # Required. + save_dir: checkpoints # dir to save checkpoints + pretrain_model: pretrain/pytorch/imagenet/resnet101-5d3b4d8f.pth + results_dir: results_dir # dir to save detection results. i.e., bboxes, masks, keypoints + auto_resume: True + +hooks: + - type: auto_checkpoint + - type: gradient_collector + kwargs: + hook_head: roi_head + hook_cls_head: cls_subnet_pred + - type: grad_clipper + kwargs: + mode: pre_defined + norm_type: 2 + max_norm: 35 + +net: + - name: backbone # backbone = resnet50(frozen_layers, out_layers, out_strides) + type: resnet101 + kwargs: + frozen_layers: [0,1] # layer0...1 is fixed + out_layers: [2,3,4] # layer1...4, commonly named Conv2...5 + out_strides: [8,16,32] # tell the strides of output features + normalize: + type: freeze_bn + initializer: + method: msra + - name: neck + prev: backbone + type: FPN + kwargs: + outplanes: 256 + start_level: 3 + num_level: 5 # if num_level>len(backbone.out_layers), additional conv with be stacked. + out_strides: [8,16,32,64,128] # strides of output features. 
aka., anchor strides for roi_head + downsample: conv # method to downsample, for FPN, it's pool, for RetienaNet, it's conv + upsample: nearest # method to interp, nearest or bilinear + initializer: + method: xavier + - name: roi_head + prev: neck + type: RetinaHeadWithIOU + kwargs: + feat_planes: 256 # channels of intermediate conv + num_classes: *num_classes # number of classes including backgroudn. for rpn, it's 2; for RetinaNet, it's 81 + initializer: + method: normal + std: 0.01 + normalize: + type: gn + kwargs: + num_groups: 32 + init_prior: 0.001 + num_anchors: 2 + class_activation: sigmoid + - name: post_process + prev: roi_head + type: retina_post_iou + kwargs: + num_classes: *num_classes # number of classes including backgroudn. for rpn, it's 2; for RetinaNet, it's 81 + cfg: + cls_loss: + type: equalized_focal_loss + kwargs: + num_classes: *num_classes + focal_alpha: 0.25 + focal_gamma: 2.0 + scale_factor: 8.0 + fpn_levels: 5 + loc_loss: + type: iou_loss + kwargs: + loss_type: giou + loss_weight: 1.0 + iou_branch_loss: + type: sigmoid_cross_entropy + kwargs: + loss_weight: 1.0 + anchor_generator: + type: hand_craft + kwargs: + anchor_ratios: [1] # anchor strides are provided as feature strides by feature extractor + anchor_scales: [6, 8] # scale of anchors relative to feature map + roi_supervisor: + type: atss + kwargs: + top_n: 18 + use_iou: True + roi_predictor: + type: base_multicls + kwargs: + pre_nms_score_thresh: 0 # to reduce computation + pre_nms_top_n: 6000 + post_nms_top_n: 1000 + roi_min_size: 0 # minimum scale of a valid roi + merger: + type: retina_multicls + kwargs: + top_n: 300 + nms: + type: naive + nms_iou_thresh: 0.5 # Required in RetinaNet. DO not nms in FPN across levels diff --git a/configs/det/efl/improved_baseline-r50_2x.yaml b/configs/det/efl/efl_improved_baseline_r50_2x_rfs.yaml similarity index 89% rename from configs/det/efl/improved_baseline-r50_2x.yaml rename to configs/det/efl/efl_improved_baseline_r50_2x_rfs.yaml index d74e44c6..eb04693d 100644 --- a/configs/det/efl/improved_baseline-r50_2x.yaml +++ b/configs/det/efl/efl_improved_baseline_r50_2x_rfs.yaml @@ -34,22 +34,34 @@ dataset: # Required. dataset: type: lvisv1 kwargs: - meta_file: annotations/lvis_v1_train.json + meta_file: lvis/annotations/lvis_v1_train.json image_reader: type: fs_opencv kwargs: - image_dir: coco + image_dir: lvis color_mode: RGB transformer: [*flip, *train_resize, *to_tensor, *normalize] + batch_sampler: + type: aspect_ratio_group + kwargs: + sampler: + type: repeat_factor + kwargs: + t: 0.001 + ri_mode: ceil + pn: 0.5 + static_size: False + batch_size: 1 + aspect_grouping: [1,] test: dataset: type: lvisv1 kwargs: - meta_file: >_file annotations/lvis_v1_val.json + meta_file: >_file lvis/annotations/lvis_v1_val.json image_reader: type: fs_opencv kwargs: - image_dir: coco + image_dir: lvis color_mode: RGB transformer: [*test_resize, *to_tensor, *normalize] evaluator: @@ -57,18 +69,14 @@ dataset: # Required. 
kwargs: gt_file: *gt_file iou_types: [bbox] - batch_sampler: - type: aspect_ratio_group - kwargs: - sampler: - type: repeat_factor - kwargs: - t: 0.001 - ri_mode: ceil - pn: 0.5 - static_size: False - batch_size: 2 - aspect_grouping: [1,] + batch_sampler: + type: aspect_ratio_group + kwargs: + sampler: + type: dist + kwargs: {} + batch_size: 1 + aspect_grouping: [1,] dataloader: type: base kwargs: diff --git a/configs/det/efl/efl_oids_r101_2x_random.yaml b/configs/det/efl/efl_oids_r101_2x_random.yaml new file mode 100644 index 00000000..1168f573 --- /dev/null +++ b/configs/det/efl/efl_oids_r101_2x_random.yaml @@ -0,0 +1,197 @@ +num_classes: &num_classes 501 + +flip: &flip + type: flip + kwargs: + flip_p: 0.5 + +resize: &train_resize + type: keep_ar_resize + kwargs: + scales: [800] + max_size: 1333 + +resize: &test_resize + type: keep_ar_resize + kwargs: + scales: [800] + max_size: 1333 + +to_tensor: &to_tensor + type: to_tensor + +normalize: &normalize + type: normalize + kwargs: + mean: [0.485, 0.456, 0.406] # ImageNet pretrained statics + std: [0.229, 0.224, 0.225] + +dataset: # Required. + train: + dataset: + type: coco + kwargs: + meta_file: openimages/annotations/openimages_challenge_2019_train_bbox.json + image_reader: + type: fs_opencv + kwargs: + image_dir: openimages/train + color_mode: RGB + transformer: [*flip, *train_resize, *to_tensor, *normalize] + batch_sampler: + type: aspect_ratio_group + kwargs: + sampler: + type: dist + kwargs: {} + batch_size: 1 + aspect_grouping: [1,] + test: + dataset: + type: coco + kwargs: + meta_file: >_file openimages/annotations/openimages_challenge_2019_val_bbox.json + image_reader: + type: fs_opencv + kwargs: + image_dir: openimages/validation + color_mode: RGB + transformer: [*test_resize, *to_tensor, *normalize] + batch_sampler: + type: aspect_ratio_group + kwargs: + sampler: + type: dist + kwargs: {} + batch_size: 1 + aspect_grouping: [1,] + dataloader: + type: base + kwargs: + num_workers: 4 + alignment: 64 + + +trainer: # Required. + max_epoch: 1.719425 # total epochs for the training + test_freq: 2 + optimizer: # optimizer = SGD(params,lr=0.001,momentum=0.9,weight_decay=0.0001) + type: SGD + kwargs: + lr: 0.00125 + momentum: 0.9 + weight_decay: 0.0001 + lr_scheduler: # lr_scheduler = MultStepLR(optimizer, milestones=[9,14],gamma=0.1) + warmup_epochs: 0.004776 # set to be 0 to disable warmup. When warmup, target_lr = init_lr * total_batch_size + warmup_type: linear + warmup_ratio: 0.001 # warmup init lr = warmup_ratio * base_lr + type: MultiStepLR + kwargs: + milestones: [1.146283, 1.528377] # epochs to decay lr + gamma: 0.1 # decay rate + +saver: # Required. + save_dir: checkpoints # dir to save checkpoints + pretrain_model: pretrain/pytorch/imagenet/resnet50-19c8e357.pth + results_dir: results_dir # dir to save detection results. 
i.e., bboxes, masks, keypoints + save_result: True + auto_resume: True + +hooks: + - type: auto_checkpoint + - type: gradient_collector + kwargs: + hook_head: roi_head + hook_cls_head: cls_subnet_pred + - type: grad_clipper + kwargs: + mode: pre_defined + norm_type: 2 + max_norm: 35 + +net: + - name: backbone # backbone = resnet50(frozen_layers, out_layers, out_strides) + type: resnet50 + kwargs: + frozen_layers: [0,1] # layer0...1 is fixed + out_layers: [2,3,4] # layer1...4, commonly named Conv2...5 + out_strides: [8,16,32] # tell the strides of output features + normalize: + type: freeze_bn + initializer: + method: msra + - name: neck + prev: backbone + type: FPN + kwargs: + outplanes: 256 + start_level: 3 + num_level: 5 # if num_level>len(backbone.out_layers), additional conv with be stacked. + out_strides: [8,16,32,64,128] # strides of output features. aka., anchor strides for roi_head + downsample: conv # method to downsample, for FPN, it's pool, for RetienaNet, it's conv + upsample: nearest # method to interp, nearest or bilinear + initializer: + method: xavier + - name: roi_head + prev: neck + type: RetinaHeadWithIOU + kwargs: + feat_planes: 256 # channels of intermediate conv + num_classes: *num_classes # number of classes including backgroudn. for rpn, it's 2; for RetinaNet, it's 81 + initializer: + method: normal + std: 0.01 + normalize: + type: gn + kwargs: + num_groups: 32 + init_prior: 0.001 + num_anchors: 2 + class_activation: sigmoid + - name: post_process + prev: roi_head + type: retina_post_iou + kwargs: + num_classes: *num_classes # number of classes including backgroudn. for rpn, it's 2; for RetinaNet, it's 81 + cfg: + cls_loss: + type: equalized_focal_loss + kwargs: + num_classes: *num_classes + focal_alpha: 0.25 + focal_gamma: 2.0 + scale_factor: 8.0 + fpn_levels: 5 + loc_loss: + type: iou_loss + kwargs: + loss_type: giou + loss_weight: 1.0 + iou_branch_loss: + type: sigmoid_cross_entropy + kwargs: + loss_weight: 1.0 + anchor_generator: + type: hand_craft + kwargs: + anchor_ratios: [1] # anchor strides are provided as feature strides by feature extractor + anchor_scales: [6, 8] # scale of anchors relative to feature map + roi_supervisor: + type: atss + kwargs: + top_n: 18 + use_iou: True + roi_predictor: + type: base_multicls + kwargs: + pre_nms_score_thresh: 0 # to reduce computation + pre_nms_top_n: 6000 + post_nms_top_n: 1000 + roi_min_size: 0 # minimum scale of a valid roi + merger: + type: retina_multicls + kwargs: + top_n: 300 + nms: + type: naive + nms_iou_thresh: 0.5 # Required in RetinaNet. DO not nms in FPN across levels diff --git a/configs/det/efl/efl_oids_r50_2x_random.yaml b/configs/det/efl/efl_oids_r50_2x_random.yaml new file mode 100644 index 00000000..8122af6c --- /dev/null +++ b/configs/det/efl/efl_oids_r50_2x_random.yaml @@ -0,0 +1,197 @@ +num_classes: &num_classes 501 + +flip: &flip + type: flip + kwargs: + flip_p: 0.5 + +resize: &train_resize + type: keep_ar_resize + kwargs: + scales: [800] + max_size: 1333 + +resize: &test_resize + type: keep_ar_resize + kwargs: + scales: [800] + max_size: 1333 + +to_tensor: &to_tensor + type: to_tensor + +normalize: &normalize + type: normalize + kwargs: + mean: [0.485, 0.456, 0.406] # ImageNet pretrained statics + std: [0.229, 0.224, 0.225] + +dataset: # Required. 
+ train: + dataset: + type: coco + kwargs: + meta_file: openimages/annotations/openimages_challenge_2019_train_bbox.json + image_reader: + type: fs_opencv + kwargs: + image_dir: openimages/train + color_mode: RGB + transformer: [*flip, *train_resize, *to_tensor, *normalize] + batch_sampler: + type: aspect_ratio_group + kwargs: + sampler: + type: dist + kwargs: {} + batch_size: 1 + aspect_grouping: [1,] + test: + dataset: + type: coco + kwargs: + meta_file: >_file openimages/annotations/openimages_challenge_2019_val_bbox.json + image_reader: + type: fs_opencv + kwargs: + image_dir: openimages/validation + color_mode: RGB + transformer: [*test_resize, *to_tensor, *normalize] + batch_sampler: + type: aspect_ratio_group + kwargs: + sampler: + type: dist + kwargs: {} + batch_size: 1 + aspect_grouping: [1,] + dataloader: + type: base + kwargs: + num_workers: 4 + alignment: 64 + + +trainer: # Required. + max_epoch: 1.719425 # total epochs for the training + test_freq: 2 + optimizer: # optimizer = SGD(params,lr=0.001,momentum=0.9,weight_decay=0.0001) + type: SGD + kwargs: + lr: 0.00125 + momentum: 0.9 + weight_decay: 0.0001 + lr_scheduler: # lr_scheduler = MultStepLR(optimizer, milestones=[9,14],gamma=0.1) + warmup_epochs: 0.004776 # set to be 0 to disable warmup. When warmup, target_lr = init_lr * total_batch_size + warmup_type: linear + warmup_ratio: 0.001 # warmup init lr = warmup_ratio * base_lr + type: MultiStepLR + kwargs: + milestones: [1.146283, 1.528377] # epochs to decay lr + gamma: 0.1 # decay rate + +saver: # Required. + save_dir: checkpoints # dir to save checkpoints + pretrain_model: pretrain/pytorch/imagenet/resnet101-5d3b4d8f.pth + results_dir: results_dir # dir to save detection results. i.e., bboxes, masks, keypoints + save_result: True + auto_resume: True + +hooks: + - type: auto_checkpoint + - type: gradient_collector + kwargs: + hook_head: roi_head + hook_cls_head: cls_subnet_pred + - type: grad_clipper + kwargs: + mode: pre_defined + norm_type: 2 + max_norm: 35 + +net: + - name: backbone # backbone = resnet50(frozen_layers, out_layers, out_strides) + type: resnet101 + kwargs: + frozen_layers: [0,1] # layer0...1 is fixed + out_layers: [2,3,4] # layer1...4, commonly named Conv2...5 + out_strides: [8,16,32] # tell the strides of output features + normalize: + type: freeze_bn + initializer: + method: msra + - name: neck + prev: backbone + type: FPN + kwargs: + outplanes: 256 + start_level: 3 + num_level: 5 # if num_level>len(backbone.out_layers), additional conv with be stacked. + out_strides: [8,16,32,64,128] # strides of output features. aka., anchor strides for roi_head + downsample: conv # method to downsample, for FPN, it's pool, for RetienaNet, it's conv + upsample: nearest # method to interp, nearest or bilinear + initializer: + method: xavier + - name: roi_head + prev: neck + type: RetinaHeadWithIOU + kwargs: + feat_planes: 256 # channels of intermediate conv + num_classes: *num_classes # number of classes including backgroudn. for rpn, it's 2; for RetinaNet, it's 81 + initializer: + method: normal + std: 0.01 + normalize: + type: gn + kwargs: + num_groups: 32 + init_prior: 0.001 + num_anchors: 2 + class_activation: sigmoid + - name: post_process + prev: roi_head + type: retina_post_iou + kwargs: + num_classes: *num_classes # number of classes including backgroudn. 
for rpn, it's 2; for RetinaNet, it's 81 + cfg: + cls_loss: + type: equalized_focal_loss + kwargs: + num_classes: *num_classes + focal_alpha: 0.25 + focal_gamma: 2.0 + scale_factor: 8.0 + fpn_levels: 5 + loc_loss: + type: iou_loss + kwargs: + loss_type: giou + loss_weight: 1.0 + iou_branch_loss: + type: sigmoid_cross_entropy + kwargs: + loss_weight: 1.0 + anchor_generator: + type: hand_craft + kwargs: + anchor_ratios: [1] # anchor strides are provided as feature strides by feature extractor + anchor_scales: [6, 8] # scale of anchors relative to feature map + roi_supervisor: + type: atss + kwargs: + top_n: 18 + use_iou: True + roi_predictor: + type: base_multicls + kwargs: + pre_nms_score_thresh: 0 # to reduce computation + pre_nms_top_n: 6000 + post_nms_top_n: 1000 + roi_min_size: 0 # minimum scale of a valid roi + merger: + type: retina_multicls + kwargs: + top_n: 300 + nms: + type: naive + nms_iou_thresh: 0.5 # Required in RetinaNet. DO not nms in FPN across levels diff --git a/configs/det/efl/efl_yolox_medium.yaml b/configs/det/efl/efl_yolox_medium.yaml index 9f904cdb..0e77e807 100644 --- a/configs/det/efl/efl_yolox_medium.yaml +++ b/configs/det/efl/efl_yolox_medium.yaml @@ -66,11 +66,11 @@ dataset: dataset: type: lvisv1 kwargs: - meta_file: annotations/lvis_v1_train.json + meta_file: lvis/annotations/lvis_v1_train.json image_reader: type: fs_opencv kwargs: - image_dir: coco + image_dir: lvis color_mode: BGR transformer: [*mosaic, *random_perspective, *mixup, *augment_hsv, *flip, *train_resize, *to_tensor] batch_sampler: @@ -88,11 +88,11 @@ dataset: dataset: type: lvisv1 kwargs: - meta_file: >_file annotations/lvis_v1_val.json + meta_file: >_file lvis/annotations/lvis_v1_val.json image_reader: type: fs_opencv kwargs: - image_dir: coco + image_dir: lvis color_mode: BGR transformer: [*test_resize, *to_tensor] evaluator: diff --git a/configs/det/efl/efl_yolox_small.yaml b/configs/det/efl/efl_yolox_small.yaml index 445ce339..fae0b246 100644 --- a/configs/det/efl/efl_yolox_small.yaml +++ b/configs/det/efl/efl_yolox_small.yaml @@ -66,11 +66,11 @@ dataset: dataset: type: lvisv1 kwargs: - meta_file: annotations/lvis_v1_train.json + meta_file: lvis/annotations/lvis_v1_train.json image_reader: type: fs_opencv kwargs: - image_dir: coco + image_dir: lvis color_mode: BGR transformer: [*mosaic, *random_perspective, *mixup, *augment_hsv, *flip, *train_resize, *to_tensor] batch_sampler: @@ -88,11 +88,11 @@ dataset: dataset: type: lvisv1 kwargs: - meta_file: >_file annotations/lvis_v1_val.json + meta_file: >_file lvis/annotations/lvis_v1_val.json image_reader: type: fs_opencv kwargs: - image_dir: coco + image_dir: lvis color_mode: BGR transformer: [*test_resize, *to_tensor] evaluator: diff --git a/configs/det/efl/eqfl_yolox_medium.yaml b/configs/det/efl/eqfl_yolox_medium.yaml index 79280c26..fa257590 100644 --- a/configs/det/efl/eqfl_yolox_medium.yaml +++ b/configs/det/efl/eqfl_yolox_medium.yaml @@ -66,11 +66,11 @@ dataset: dataset: type: lvisv1 kwargs: - meta_file: annotations/lvis_v1_train.json + meta_file: lvis/annotations/lvis_v1_train.json image_reader: type: fs_opencv kwargs: - image_dir: coco + image_dir: lvis color_mode: BGR transformer: [*mosaic, *random_perspective, *mixup, *augment_hsv, *flip, *train_resize, *to_tensor] batch_sampler: @@ -88,11 +88,11 @@ dataset: dataset: type: lvisv1 kwargs: - meta_file: >_file annotations/lvis_v1_val.json + meta_file: >_file lvis/annotations/lvis_v1_val.json image_reader: type: fs_opencv kwargs: - image_dir: coco + image_dir: lvis color_mode: BGR 
transformer: [*test_resize, *to_tensor] evaluator: diff --git a/configs/det/efl/eqfl_yolox_samll.yaml b/configs/det/efl/eqfl_yolox_samll.yaml index 6b97c95a..22b1cb51 100644 --- a/configs/det/efl/eqfl_yolox_samll.yaml +++ b/configs/det/efl/eqfl_yolox_samll.yaml @@ -66,11 +66,11 @@ dataset: dataset: type: lvisv1 kwargs: - meta_file: annotations/lvis_v1_train.json + meta_file: lvis/annotations/lvis_v1_train.json image_reader: type: fs_opencv kwargs: - image_dir: coco + image_dir: lvis color_mode: BGR transformer: [*mosaic, *random_perspective, *mixup, *augment_hsv, *flip, *train_resize, *to_tensor] batch_sampler: @@ -88,11 +88,11 @@ dataset: dataset: type: lvisv1 kwargs: - meta_file: &gt_file annotations/lvis_v1_val.json + meta_file: &gt_file lvis/annotations/lvis_v1_val.json image_reader: type: fs_opencv kwargs: - image_dir: coco + image_dir: lvis color_mode: BGR transformer: [*test_resize, *to_tensor] evaluator: diff --git a/docs/equalized_focal_loss.md b/docs/equalized_focal_loss.md new file mode 100644 index 00000000..4ccc1cfc --- /dev/null +++ b/docs/equalized_focal_loss.md @@ -0,0 +1,177 @@
+# Equalized Focal Loss
+
+This repo is about the Equalized Focal Loss (EFL) and the Equalized Quality Focal Loss (EQFL) for one-stage long-tailed object detection.
+
+## Requirements
+
+- Python 3.6+
+- Pytorch 1.5.0+
+- CUDA 9.0+
+- [EOD](https://github.com/ModelTC/EOD)
+
+## Prepare Dataset (LVIS v1)
+### images and annotations
+
+LVIS v1 uses the same images as COCO. Thus you only need to download the COCO dataset to a folder ($COCO) and link its `train`, `val`, and `test` directories into the LVIS folder ($LVIS).
+```
+# first, download $COCO/train, $COCO/val, and $COCO/test
+mkdir $LVIS
+ln -s $COCO/train $LVIS
+ln -s $COCO/val $LVIS
+ln -s $COCO/test $LVIS
+```
+Then download the annotations from the official website of [LVIS](https://www.lvisdataset.org/dataset). The annotations should be placed in ($LVIS/annotations).
+```
+cd $LVIS
+mkdir annotations
+# then download the annotations lvis_v1_train.json and lvis_v1_val.json
+```
+Finally, the file structure of ($LVIS) will look like this:
+```
+$LVIS
+  ├── annotations
+  │   ├── lvis_v1_val.json
+  │   ├── lvis_v1_train.json
+  ├── train2017
+  │   ├── 000000004134.jpg
+  │   ├── 000000031817.jpg
+  │   ├── ......
+  ├── val2017
+  ├── test2017
+```
+### configs
+
+Modify the dataset config like this:
+```
+dataset:
+  train:
+    dataset:
+      type: lvisv1
+      kwargs:
+        meta_file: $LVIS/annotations/lvis_v1_train.json
+        image_reader:
+          type: fs_opencv
+          kwargs:
+            image_dir: $LVIS
+            color_mode: BGR
+        ...
+  test:
+    dataset:
+      type: lvisv1
+      kwargs:
+        meta_file: &gt_file $LVIS/annotations/lvis_v1_val.json
+        image_reader:
+          type: fs_opencv
+          kwargs:
+            image_dir: $LVIS
+            color_mode: BGR
+        ...
+    evaluator:
+      type: LVIS
+      kwargs:
+        gt_file: *gt_file
+        iou_types: [bbox]
+```
+
+## Benchmark Results
+
+We provide the benchmark results of the EFL (Equalized Focal Loss) and the EQFL (Equalized Quality Focal Loss).
+The results are divided into the improved baseline series and the YOLOX series.
+All models are trained with the repeat factor sampler (RFS) under a 16-GPU setting.
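+For reference, the RFS is configured through the train `batch_sampler` in the improved baseline configs added here; the relevant excerpt (with a per-GPU `batch_size` of 1 under the 16-GPU setting) looks like this:
+```
+batch_sampler:
+  type: aspect_ratio_group
+  kwargs:
+    sampler:
+      type: repeat_factor   # RFS: oversamples images that contain rare categories
+      kwargs:
+        t: 0.001            # frequency threshold used to compute the repeat factors
+        ri_mode: ceil
+        pn: 0.5
+        static_size: False
+    batch_size: 1           # per-GPU batch size; use 2 for the 8-GPU setting
+    aspect_grouping: [1,]
+```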
+
+**Improved Baseline Series**
+
+|config | loss | pretrain | scheduler | AP | APr | APc | APf | weights |
+|------|:---:|:---:|:---:|:---:|:---:|:---:|:---:|---:|
+|[Res50](https://github.com/ModelTC/EOD/blob/main/configs/det/efl/efl_improved_baseline_r50_2x_rfs.yaml)| EFL | imagenet | 24e | 27.5 | 20.2 | 26.1 | 32.4 | [model](https://github.com/ModelTC/EOD/releases/download/0.1.0/efl_improved_baseline_r50.pth) |
+|[Res101](https://github.com/ModelTC/EOD/blob/main/configs/det/efl/efl_improved_baseline_r101_2x_rfs.yaml) | EFL | imagenet | 24e | 29.2 | 23.5 | 27.4 | 33.8 | [model](https://github.com/ModelTC/EOD/releases/download/0.1.0/efl_improved_baseline_r101.pth) |
+
+**YOLOX-IP Series**
+
+|config | loss | pretrain | scheduler | AP | APr | APc | APf | weights |
+|------|:---:|:---:|:---:|:---:|:---:|:---:|:---:|---:|
+|[YOLOX-IP-S](https://github.com/ModelTC/EOD/blob/main/configs/det/efl/efl_yolox_small.yaml)| EFL | None | 300e | 23.3 | 18.1 | 21.2 | 28.0 | [model](https://github.com/ModelTC/EOD/releases/download/0.1.0/efl_yolox_small.pth) |
+|[YOLOX-IP-S](https://github.com/ModelTC/EOD/blob/main/configs/det/efl/eqfl_yolox_small.yaml)| EQFL | None | 300e | 24.2 | 16.3 | 22.7 | 29.4 | [model](https://github.com/ModelTC/EOD/releases/download/0.1.0/eqfl_yolox_small.pth) |
+|[YOLOX-IP-M](https://github.com/ModelTC/EOD/blob/main/configs/det/efl/efl_yolox_medium.yaml)| EFL | None | 300e | 30.0 | 23.8 | 28.2 | 34.7 | [model](https://github.com/ModelTC/EOD/releases/download/0.1.0/efl_yolox_medium.pth) |
+|[YOLOX-IP-M](https://github.com/ModelTC/EOD/blob/main/configs/det/efl/eqfl_yolox_medium.yaml)| EQFL | None | 300e | 31.0 | 24.0 | 29.1 | 36.2 | [model](https://github.com/ModelTC/EOD/releases/download/0.1.0/eqfl_yolox_medium.pth) |
+
+## Testing with Pretrained Models
+
+For example, if you want to test the pretrained model of YOLOX-IP-M with the EQFL (Equalized Quality Focal Loss):
+```
+mkdir pretrain
+# download the weight of YOLOX-IP-M with the EQFL (eqfl_yolox_medium.pth)
+```
+Edit the saver config of `configs/det/efl/eqfl_yolox_medium.yaml`:
+```
+saver:
+  save_dir: checkpoints/yolox_medium
+  pretrain_model: pretrain/eqfl_yolox_medium.pth
+  results_dir: results_dir/yolox_medium
+  auto_resume: True
+```
+Then run the following command to evaluate:
+```
+python -m eod train -e --config configs/det/efl/eqfl_yolox_medium.yaml --nm 1 --ng 1 --launch pytorch 2>&1 | tee log.test
+```
+
+## Training
+
+Note that all of our benchmark results come from a 16-GPU setting on NVIDIA A100s. Since 16 GPUs are not always available, we also provide an 8-GPU setting. For example, you can run the following command to train the improved baseline model with EFL (Res50):
+```
+# first, change the batch_size of the train batch_sampler to 2 in
+# configs/det/efl/efl_improved_baseline_r50_2x_rfs.yaml.
+
+python -m eod train --config configs/det/efl/efl_improved_baseline_r50_2x_rfs.yaml --nm 1 --ng 8 --launch pytorch 2>&1 | tee log.train
+```
+Meanwhile, the gradient_collector hook in the EFL and EQFL configs is essential for the gradient collection mechanism. Please make sure it is enabled when training networks with these losses.
+```
+hooks:
+  - type: gradient_collector
+    kwargs:
+      hook_head: roi_head # the classification head
+      hook_cls_head: cls_preds # the last layer of the classification head
+```
+- Tip 1: our gradient collection mechanism only supports the `fp32` setting. Training with `fp16` will give unexpected results.
+- Tip 2: an alternate approach to obtain the gradient is manual calculation, just like the way it is done in [EQLv2](https://github.com/tztztztztz/eqlv2/blob/master/mmdet/models/losses/eqlv2.py#L90). Note that you need to write your own code to calculate the gradient from the derivative of EFL or EQFL; a sketch of this idea is given at the end of this document.
+
+
+## How to Train EFL on OpenImages
+
+Our EOD framework supports training and inference on OIDs (but cannot evaluate the results directly). Most of our steps follow the guide in [EQLv2](https://github.com/tztztztztz/eqlv2#how-to-train-eqlv2-on-openimages).
+
+Here are the steps to get a result on OIDs (the bold steps are the operations that require attention; the other steps are the same as in EQLv2):
+- Download the data.
+- Convert the `.csv` annotations to a COCO-like `.json` file.
+- **Train and run inference with the models**
+  ```
+  # first, edit the image and annotation paths in
+  # configs/det/efl/efl_oids_r50_2x_random.yaml.
+  # then change the batch_size to 2 for the 8-GPU setting.
+  # we train the model with a 120k/160k/180k schedule with the random sampler.
+
+  python -m eod train --config configs/det/efl/efl_oids_r50_2x_random.yaml --nm 1 --ng 8 --launch pytorch 2>&1 | tee log.train
+  ```
+- **Obtain the inference results `results.txt.all` under `results_dir`**
+- Convert the COCO-like `results.txt.all` result file to an OpenImages-style `.csv` file.
+- Evaluate the results file using the official API.
+- Parse the AP file and output the grouped AP.
+
+Our OIDs results are:
+```
+# Res50
+mAP 0.5151686921063541
+mAP0: 0.5277291485206159
+mAP1: 0.5289892622419364
+mAP2: 0.5082855060435439
+mAP3: 0.5017385315742882
+mAP4: 0.5094778258438134
+
+# Res101
+mAP 0.5255129979062584
+mAP0: 0.53395152741142
+mAP1: 0.5381753139992043
+mAP2: 0.5139450335860504
+mAP3: 0.5182665336563826
+mAP4: 0.5234797367633899
+```
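+
+As a sketch of the manual gradient calculation mentioned in Tip 2 above, the snippet below shows how per-class positive and negative gradient magnitudes could be accumulated without the gradient_collector hook. It is only an illustration: the function name is ours, a plain sigmoid focal loss stands in for the full EFL/EQFL formulation, and the gradient is obtained with autograd on a detached copy of the logits instead of an analytic derivative.
+```
+import torch
+import torch.nn.functional as F
+
+def collect_focal_grad(cls_logits, targets, alpha=0.25, gamma=2.0):
+    """Accumulate per-class |d loss / d logit| for a sigmoid focal loss.
+
+    cls_logits: (N, C) raw classification logits.
+    targets:    (N, C) float binary targets (1 for the assigned class, else 0).
+    Returns (pos_grad, neg_grad), each of shape (C,).
+    """
+    logits = cls_logits.detach().requires_grad_(True)
+    prob = torch.sigmoid(logits)
+    p_t = prob * targets + (1.0 - prob) * (1.0 - targets)
+    alpha_t = alpha * targets + (1.0 - alpha) * (1.0 - targets)
+    ce = F.binary_cross_entropy_with_logits(logits, targets, reduction="none")
+    loss = (alpha_t * (1.0 - p_t) ** gamma * ce).sum()
+    grad = torch.autograd.grad(loss, logits)[0].abs()
+    pos_grad = (grad * targets).sum(dim=0)          # gradient mass from positive samples
+    neg_grad = (grad * (1.0 - targets)).sum(dim=0)  # gradient mass from negative samples
+    return pos_grad, neg_grad
+```
+Since EFL drives its category-dependent focusing factor with the accumulated ratio of positive to negative gradients, these statistics would in practice be kept as running sums across iterations rather than recomputed from a single batch.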