From 1cf3c81df46cd11274a40e3432b1765b730b8242 Mon Sep 17 00:00:00 2001
From: ZHUI <85285159+longzw1997@users.noreply.github.com>
Date: Thu, 19 Oct 2023 14:19:13 +0800
Subject: [PATCH] update spec/fixed typo

---
 README.md                        | 99 +++++++++++---------------------
 data_format.md                   |  6 ++
 test_dist.sh                     |  4 +-
 test_slurm.sh                    |  4 +-
 train_dist.sh                    |  4 +-
 train_slrum.sh => train_slurm.sh |  4 +-
 6 files changed, 48 insertions(+), 73 deletions(-)
 rename train_slrum.sh => train_slurm.sh (76%)

diff --git a/README.md b/README.md
index 489b32e..b0ffb77 100644
--- a/README.md
+++ b/README.md
@@ -1,8 +1,7 @@
- +
-
 # Open GroundingDino

 This is the third party implementation of the paper **[Grounding DINO: Marrying DINO with Grounded Pre-Training for Open-Set Object Detection](https://arxiv.org/abs/2303.05499)** by [Zuwei Long]() and [Wei Li](https://github.com/bigballon).
@@ -12,7 +11,7 @@ This is the third party implementation of the paper **[Grounding DINO: Marrying

 # Supported Features

-|                                | Official release version | The Version We Replicated |
+|                                | Official release version | The version we replicated |
 | ------------------------------ | ------------------------ | ------------------------- |
 | Inference                      | ✔                        | ✔                         |
 | Train (Object Detection data)  | ✖                        | ✔                         |
@@ -24,7 +23,7 @@ This is the third party implementation of the paper **[Grounding DINO: Marrying

 # Setup

-We test our models under ```python=3.7.11,pytorch=1.11.0,cuda=11.3```. Other versions might be available as well.
+We test our models under ``python=3.7.11, pytorch=1.11.0, cuda=11.3``. Other versions may work as well.

 1. Clone the GroundingDINO repository from GitHub.

@@ -38,24 +37,11 @@ git clone https://github.com/longzw1997/Open-GroundingDino.git && cd Open-Ground
 pip install -r requirements.txt
 cd models/GroundingDINO/ops
 python setup.py build install
-# unit test (should see all checking is True)
 python test.py
 cd ../../..
 ```

-3. Download [pre-trained model](https://github.com/IDEA-Research/GroundingDINO/releases) and [BERT](https://huggingface.co/bert-base-uncased) weights.
-
-```bash
-mkdir weights
-cd weights
-wget -q https://github.com/IDEA-Research/GroundingDINO/releases/download/v0.1.0-alpha/groundingdino_swint_ogc.pth
-cd ..
-mkdir bert_weights
-cd bert_weights
-wget -q https://drive.google.com/drive/folders/1eM1HYf2K161YPzIcRDDMzE7S4WBGmDLM?usp=share_link
-cd ..
-```
-
+3. Download the [pre-trained model](https://github.com/IDEA-Research/GroundingDINO/releases) and [BERT](https://huggingface.co/bert-base-uncased) weights, then modify the corresponding paths in the train/test scripts.
 # Dataset
@@ -141,79 +127,64 @@ config/datasets_mixed_odvg.json  # support mixed dataset for both OD and VG

 # Training

-* Before starting the training, you need to modify the ''config/datasets_vg_example.json'' according to ''data_format.md''
-* The evaluation code defaults to using coco_val2017 for evaluation. If you are evaluating with your own test set, you need to convert the test data to coco format (not the ovdg format in data_format.md), and modify the config to set use_coco_eval = False. (The COCO dataset has 80 classes used for training but 90 categories in total, so there is a built-in mapping in the code.)
+- Before starting the training, you need to modify ``config/datasets_vg_example.json`` according to ``data_format.md``.
+- The evaluation code defaults to using coco_val2017 for evaluation. If you are evaluating with your own test set, you need to convert the test data to COCO format (not the odvg format in data_format.md) and modify the config to set **use_coco_eval = False** (the COCO dataset has 80 classes used for training but 90 categories in total, so there is a built-in mapping in the code).

 ``` bash
-# train/eval on slrum cluster:
-bash train_slrum.sh ${PARTITION} ${GPU_NUM} ${CFG} ${DATASETS} ${OUTPUT_DIR}
-bash test_slrum.sh ${PARTITION} ${GPU_NUM} ${CFG} ${DATASETS} ${OUTPUT_DIR}
-# e.g. check train_slrum.sh for more details
-# bash train_slrum.sh v100_32g 32 config/cfg_odvg.py config/datasets_mixed_odvg.json ./logs
-# bash train_slrum.sh v100_32g 8 config/cfg_coco.py config/datasets_od_example.json ./logs
-
 # train/eval on torch.distributed.launch:
 bash train_dist.sh ${GPU_NUM} ${CFG} ${DATASETS} ${OUTPUT_DIR}
 bash test_dist.sh ${GPU_NUM} ${CFG} ${DATASETS} ${OUTPUT_DIR}
-```
+
+# train/eval on slurm cluster:
+bash train_slurm.sh ${PARTITION} ${GPU_NUM} ${CFG} ${DATASETS} ${OUTPUT_DIR}
+bash test_slurm.sh ${PARTITION} ${GPU_NUM} ${CFG} ${DATASETS} ${OUTPUT_DIR}
+# e.g. check train_slurm.sh for more details
+# bash train_slurm.sh v100_32g 32 config/cfg_odvg.py config/datasets_mixed_odvg.json ./logs
+# bash train_slurm.sh v100_32g 8 config/cfg_coco.py config/datasets_od_example.json ./logs
+```

 # Results and Models
-<table>
-  <tr>
-    <th></th>
-    <th>Name</th>
-    <th>Backbone</th>
-    <th>Style</th>
-    <th>Pretrain data</th>
-    <th>mAP on COCO</th>
-    <th>Checkpoint</th>
-    <th>Config</th>
-    <th>log</th>
-  </tr>
-  <tr>
-    <td>1</td>
-    <td>GroundingDINO-T (offical)</td>
-    <td>Swin-T</td>
-    <td>zero-shot</td>
-    <td>O365,GoldG,Cap4M</td>
-    <td>48.4 (zero-shot)</td>
-    <td>GitHub link</td>
-    <td>link</td>
-    <td>link</td>
-  </tr>
-  <tr>
-    <td>2</td>
-    <td>GroundingDINO-T (finetune)</td>
-    <td>Swin-T</td>
-    <td>use coco finetune</td>
-    <td>O365,GoldG,Cap4M</td>
-    <td>57.3 (fine-tune)</td>
-    <td>GitHub link</td>
-    <td>link</td>
-    <td>link</td>
-  </tr>
-  <tr>
-    <td>3</td>
-    <td>GroundingDINO-T (pretrain)</td>
-    <td>Swin-T</td>
-    <td>zero-shot</td>
-    <td>COCO,O365,LIVS,V3Det,<br>GRIT-200K,Flickr30k(total 1.8M)</td>
-    <td>55.1 (zero-shot)</td>
-    <td>GitHub link</td>
-    <td>link</td>
-    <td>link</td>
-  </tr>
-</table>
+<table>
+  <tr>
+    <th>Name</th>
+    <th>Backbone</th>
+    <th>Pretrain data</th>
+    <th>Task</th>
+    <th>mAP on COCO</th>
+    <th>Ckpt</th>
+    <th>Misc</th>
+  </tr>
+  <tr>
+    <td>GroundingDINO-T<br>(official)</td>
+    <td>Swin-T</td>
+    <td>O365,GoldG,Cap4M</td>
+    <td>zero-shot</td>
+    <td>48.4<br>(zero-shot)</td>
+    <td>model</td>
+    <td>-</td>
+  </tr>
+  <tr>
+    <td>GroundingDINO-T<br>(fine-tune)</td>
+    <td>Swin-T</td>
+    <td>O365,GoldG,Cap4M</td>
+    <td>finetune<br>w/ coco</td>
+    <td>57.3<br>(fine-tune)</td>
+    <td>model</td>
+    <td>cfg | log</td>
+  </tr>
+  <tr>
+    <td>GroundingDINO-T<br>(pretrain)</td>
+    <td>Swin-T</td>
+    <td>COCO,Objects365,LVIS,V3Det,<br>GRIT-200K,Flickr30k (total 1.8M)</td>
+    <td>zero-shot</td>
+    <td>55.1<br>(zero-shot)</td>
+    <td>model</td>
+    <td>cfg | log</td>
+  </tr>
+</table>
-GRIT-200K generated by [GLIP](https://github.com/microsoft/GLIP) and [spaCy](https://spacy.io/).
+- [GRIT](https://huggingface.co/datasets/zzliang/GRIT)-200K generated by [GLIP](https://github.com/microsoft/GLIP) and [spaCy](https://spacy.io/).

 # Contact

@@ -221,7 +192,6 @@ GRIT-200K generated by [GLIP](https://github.com/microsoft/GLIP) and [spaCy](htt
 - longzuwei at sensetime.com
 - liwei1 at sensetime.com

-Any discussions, suggestions and questions are welcome!

 # Acknowledgments

 Provided codes were adapted from:
@@ -242,5 +212,4 @@ Provided codes were adapted from:
 }
 ```

-Feel free to contact me if you have any suggestions or questions, issues are welcome,
-create a PR if you find any bugs or you want to contribute.
+Feel free to contact us if you have any suggestions or questions. Issues are welcome, and please create a pull request if you find any bugs or want to contribute code.

diff --git a/data_format.md b/data_format.md
index 8f91e7b..72eb739 100644
--- a/data_format.md
+++ b/data_format.md
@@ -1,6 +1,8 @@
 # DATASETS file

+#### e.g. ``config/datasets_mixed_odvg.json``
+
 The 'train' supports multiple datasets for simultaneous training, and 'dataset_model' needs to be set to 'odvg'.
 The 'val' only supports datasets in the COCO format, so 'dataset_model' should be set to 'coco', and 'label_map' should be set to null.

 ```json
@@ -28,13 +30,16 @@
     ]
 }
 ```
+
 # label_map:
+
 In dictionary form, indices start from "0" (it is essential to start from 0 to accommodate caption/grounding data).
 Here is an example:

 ```json
 {"0": "person", "1": "bicycle", "2": "car", "3": "motorcycle", "4": "airplane", "5": "bus", "6": "train", "7": "truck", "8": "boat", "9": "traffic light", "10": "fire hydrant", "11": "stop sign", "12": "parking meter", "13": "bench", "14": "bird", "15": "cat", "16": "dog", "17": "horse", "18": "sheep", "19": "cow", "20": "elephant", "21": "bear", "22": "zebra", "23": "giraffe", "24": "backpack", "25": "umbrella", "26": "handbag", "27": "tie", "28": "suitcase", "29": "frisbee", "30": "skis", "31": "snowboard", "32": "sports ball", "33": "kite", "34": "baseball bat", "35": "baseball glove", "36": "skateboard", "37": "surfboard", "38": "tennis racket", "39": "bottle", "40": "wine glass", "41": "cup", "42": "fork", "43": "knife", "44": "spoon", "45": "bowl", "46": "banana", "47": "apple", "48": "sandwich", "49": "orange", "50": "broccoli", "51": "carrot", "52": "hot dog", "53": "pizza", "54": "donut", "55": "cake", "56": "chair", "57": "couch", "58": "potted plant", "59": "bed", "60": "dining table", "61": "toilet", "62": "tv", "63": "laptop", "64": "mouse", "65": "remote", "66": "keyboard", "67": "cell phone", "68": "microwave", "69": "oven", "70": "toaster", "71": "sink", "72": "refrigerator", "73": "book", "74": "clock", "75": "vase", "76": "scissors", "77": "teddy bear", "78": "hair drier", "79": "toothbrush"}
 ```

 # odvg Dataset Format
+
 The files are in jsonl format, with one json object per line, as follows:

 Object Detection datasets utilize the 'detection' field. If dealing with an Object Detection dataset, an additional 'label_map' is required in the Dataset settings.
 Visual Grounding datasets employ the 'grounding' field.
@@ -77,5 +82,6 @@ Visual Grounding datasets employ the 'grounding' field.
 }
 }
 ```
+
 You can refer to the tools in "./tools" to convert other formats to the odvg data format.
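As a rough illustration of the odvg detection format described above, the sketch below converts one COCO-style image entry into an odvg jsonl line. This is a hypothetical helper, not part of this patch or of the repository's ``./tools`` converters; the field names (``filename``, ``detection``, ``instances``, ``bbox``, ``label``, ``category``) follow the examples in data_format.md, and the sample data is made up.

```python
import json

def coco_to_odvg_record(image, anns, label_map):
    """Hypothetical converter: one COCO image + its annotations -> odvg record."""
    instances = []
    for ann in anns:
        x, y, w, h = ann["bbox"]           # COCO boxes are [x, y, width, height]
        instances.append({
            "bbox": [x, y, x + w, y + h],  # odvg boxes are [x1, y1, x2, y2]
            "label": ann["category_id"],
            "category": label_map[str(ann["category_id"])],
        })
    return {
        "filename": image["file_name"],
        "height": image["height"],
        "width": image["width"],
        "detection": {"instances": instances},
    }

# One JSON object per line (jsonl), one line per image:
image = {"file_name": "000001.jpg", "height": 480, "width": 640}
anns = [{"bbox": [10, 20, 30, 40], "category_id": 0}]
line = json.dumps(coco_to_odvg_record(image, anns, {"0": "person"}))
```

Writing one such line per image (e.g. with ``jsonlines`` or a plain file write) yields a file usable as the 'anno' entry of an odvg dataset.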
diff --git a/test_dist.sh b/test_dist.sh
index c1bd151..429fb4a 100644
--- a/test_dist.sh
+++ b/test_dist.sh
@@ -12,5 +12,5 @@ python -m torch.distributed.launch --nproc_per_node=${GPU_NUM} main.py \
         --eval \
         -c ${CFG} \
         --datasets ${DATASETS} \
-        --pretrain_model_path ./weights/groundingdino_swint_ogc.pth \
-        --options text_encoder_type=./bert_weights/bert-base-uncased
+        --pretrain_model_path /path/to/groundingdino_swint_ogc.pth \
+        --options text_encoder_type=/path/to/bert-base-uncased
diff --git a/test_slurm.sh b/test_slurm.sh
index 04e768e..a90969e 100644
--- a/test_slurm.sh
+++ b/test_slurm.sh
@@ -17,5 +17,5 @@ srun -p ${PARTITION} \
         -c ${CFG} \
         --eval \
         --datasets ${DATASETS} \
-        --pretrain_model_path ./weights/groundingdino_swint_ogc.pth \
-        --options text_encoder_type=./bert_weights/bert-base-uncased
\ No newline at end of file
+        --pretrain_model_path /path/to/groundingdino_swint_ogc.pth \
+        --options text_encoder_type=/path/to/bert-base-uncased
diff --git a/train_dist.sh b/train_dist.sh
index 894695a..49956f8 100644
--- a/train_dist.sh
+++ b/train_dist.sh
@@ -11,5 +11,5 @@ python -m torch.distributed.launch --nproc_per_node=${GPU_NUM} main.py \
         --output_dir ${OUTPUT_DIR} \
         -c ${CFG} \
         --datasets ${DATASETS} \
-        --pretrain_model_path ./weights/groundingdino_swint_ogc.pth \
-        --options text_encoder_type=./bert_weights/bert-base-uncased
+        --pretrain_model_path /path/to/groundingdino_swint_ogc.pth \
+        --options text_encoder_type=/path/to/bert-base-uncased
diff --git a/train_slrum.sh b/train_slurm.sh
similarity index 76%
rename from train_slrum.sh
rename to train_slurm.sh
index 1d390f6..379e03c 100644
--- a/train_slrum.sh
+++ b/train_slurm.sh
@@ -17,5 +17,5 @@ srun -p ${PARTITION} \
     python -u main.py --output_dir ${OUTPUT_DIR} \
         -c ${CFG} \
         --datasets ${DATASETS} \
-        --pretrain_model_path ./weights/groundingdino_swint_ogc.pth \
-        --options text_encoder_type=./bert_weights/bert-base-uncased
\ No newline at end of file
+        --pretrain_model_path /path/to/groundingdino_swint_ogc.pth \
+        --options text_encoder_type=/path/to/bert-base-uncased